From 8a89c1623735fc24b6ab453b200b42dc589547e2 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Fri, 22 May 2026 19:54:49 +1000 Subject: [PATCH 01/11] feat(ai): add imageInputs / videoInputs / audioInputs for image-conditioned generation (closes #618) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds optional `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-to-image, multi-reference, mask / inpaint, image-to-video, and starting-frame flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field. Provider behavior: - OpenAI image: gpt-image-1 / -mini route to `images.edit()` (up to 16 + mask); dall-e-2 routes to `images.edit()` with one source; dall-e-3 throws. - OpenAI video: Sora-2 / -pro accept a single `input_reference`; throws on >1. - Gemini: native models receive inputs as multimodal `contents` parts; Imagen throws (text-only). - fal: 1 input → `image_url`, >1 → `image_urls`; metadata roles map to `mask_url` / `control_image_url` / `reference_image_urls`; video adds `start_image_url` / `end_image_url`. Interim mapping until the fal schemas library lands. - Grok, OpenRouter: throw with a link back to #618 (pending native Imagine API rewrite and multimodal injection work respectively). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/image-and-video-inputs.md | 22 +++ docs/media/image-generation.md | 111 ++++++++++++ docs/media/video-generation.md | 68 +++++++ packages/ai-event-client/src/index.ts | 6 + packages/ai-fal/src/adapters/image.ts | 17 ++ packages/ai-fal/src/adapters/video.ts | 73 +++++++- packages/ai-fal/src/image/image-inputs.ts | 165 +++++++++++++++++ packages/ai-fal/tests/image-inputs.test.ts | 140 +++++++++++++++ packages/ai-gemini/src/adapters/image.ts | 90 +++++++++- packages/ai-grok/src/adapters/image.ts | 12 ++ packages/ai-openai/src/adapters/image.ts | 168 +++++++++++++++++- packages/ai-openai/src/adapters/video.ts | 24 +++ .../src/image/image-input-to-file.ts | 70 ++++++++ .../ai-openai/tests/image-adapter.test.ts | 157 ++++++++++++++++ packages/ai-openrouter/src/adapters/image.ts | 12 ++ .../skills/ai-core/media-generation/SKILL.md | 125 ++++++++++++- .../ai/src/activities/generateImage/index.ts | 23 ++- .../ai/src/activities/generateVideo/index.ts | 42 ++++- packages/ai/src/types.ts | 62 +++++++ testing/e2e/src/lib/feature-support.ts | 10 ++ testing/e2e/src/lib/types.ts | 4 + 21 files changed, 1393 insertions(+), 8 deletions(-) create mode 100644 .changeset/image-and-video-inputs.md create mode 100644 packages/ai-fal/src/image/image-inputs.ts create mode 100644 packages/ai-fal/tests/image-inputs.test.ts create mode 100644 packages/ai-openai/src/image/image-input-to-file.ts diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md new file mode 100644 index 000000000..3620076c0 --- /dev/null +++ b/.changeset/image-and-video-inputs.md @@ -0,0 +1,22 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-openai': minor +'@tanstack/ai-gemini': minor +'@tanstack/ai-fal': minor +'@tanstack/ai-grok': patch +'@tanstack/ai-openrouter': patch +'@tanstack/ai-event-client': patch +--- + +Add `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-conditioned 
generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field. + +Provider behavior in this release: + +- **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error. +- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws. +- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only). +- **fal.ai** — Inputs map to fal field names: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Video adapter additionally honors `role: 'start_frame'` / `'end_frame'`. +- **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work). +- **Anthropic** — Unchanged (no image generation API). + +Closes #618. diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md index bb03d6363..1a6481da9 100644 --- a/docs/media/image-generation.md +++ b/docs/media/image-generation.md @@ -79,6 +79,9 @@ All image adapters support these common options: | `prompt` | `string` | Text description of the image to generate (required) | | `numberOfImages` | `number` | Number of images to generate | | `size` | `string` | Size of the generated image in WIDTHxHEIGHT format | +| `imageInputs?` | `ImagePart[]` | Image conditioning inputs for image-to-image, reference-guided, edit, or multi-reference generation. See [Image-Conditioned Generation](#image-conditioned-generation) below. 
| +| `videoInputs?` | `VideoPart[]` | Video conditioning inputs. Provider support is limited; most adapters throw. | +| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs. Provider support is limited; most adapters throw. | | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) | ### Size Options @@ -130,6 +133,114 @@ const result = await generateImage({ }) ``` +## Image-Conditioned Generation + +`generateImage()` accepts an optional `imageInputs` field for image-to-image, +reference-guided, multi-reference, and edit / inpaint flows. The field reuses +the same `ImagePart` shape used elsewhere for multimodal content: + +```typescript +import { generateImage, type ImagePart } from '@tanstack/ai' +import { openaiImage } from '@tanstack/ai-openai' + +const reference: ImagePart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, +} + +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Turn this into a cinematic product photo', + imageInputs: [reference], +}) +``` + +### Source format + +`ImagePart.source` is a discriminated union supporting both URLs and inline +base64 data — pass whichever you have: + +```typescript +// URL source +{ type: 'image', source: { type: 'url', value: 'https://example.com/img.png' } } + +// Inline base64 data (mimeType required) +{ type: 'image', source: { type: 'data', value: base64String, mimeType: 'image/png' } } +``` + +OpenAI's edit endpoint requires file uploads; the adapter fetches URL sources +and converts base64 to a `File` automatically. + +### Role hints via `metadata.role` + +When a generation has multiple inputs with different roles (mask vs reference +vs start/end frame), set `metadata.role` on each part. Adapters route by role +to the provider-specific field; parts without a role fall back to positional +mapping. 
+ +| Role | Maps to | +| --------------- | -------------------------------------------------------------------------------------- | +| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional fallback | +| `'character'` | Same as `'reference'`; Veo `referenceImages` slot | +| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | +| `'control'` | fal `control_image_url` (ControlNet / depth / pose conditioning) | +| `'start_frame'` | fal `start_image_url`; Veo `image` (used by `generateVideo`) | +| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` (used by `generateVideo`) | + +#### Inpaint / edit with a mask + +```typescript +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Replace the masked region with a tree', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: photoUrl }, + }, + { + type: 'image', + source: { type: 'url', value: maskUrl }, + metadata: { role: 'mask' }, + }, + ], +}) +``` + +#### Multi-reference composition + +```typescript +const product: ImagePart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, +} + +const style: ImagePart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/style.png' }, +} + +await generateImage({ + adapter: geminiImage('gemini-3.1-flash-image-preview'), + prompt: 'Generate a new image of the product using the style of the second reference', + imageInputs: [product, style], +}) +``` + +### Provider support + +| Provider | Behavior | +| ------------ | --------------------------------------------------------------------------------------------------------- | +| **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.
`dall-e-2` → `images.edit()` with 1 source image only.
`dall-e-3` → throws (no edit support). | +| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.
Imagen models → throws (text-to-image only). | +| **fal.ai** | 1 input → `image_url`; multiple → `image_urls`. `role: 'mask'` → `mask_url`. `role: 'control'` → `control_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | +| **Grok** | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up. | +| **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up. | +| **Anthropic** | n/a — no image generation API. | + +Adapters that don't support image-conditioned generation throw a clear +runtime error so calls fail fast rather than silently dropping the inputs. + ## Model Options ### OpenAI Model Options diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index ae325f95b..59ad8ec24 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -366,8 +366,76 @@ And returns: | `prompt` | `string` | Text description of the video to generate (required) | | `size` | `string` | Video resolution in WIDTHxHEIGHT format | | `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) | +| `imageInputs?` | `ImagePart[]` | Image conditioning inputs — starting frame, end frame, character / reference images. See [Image-to-Video](#image-to-video) below. | +| `videoInputs?` | `VideoPart[]` | Video conditioning inputs for video-to-video / source clip flows. Provider support varies. | +| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs for lipsync / voice cloning flows. Provider support varies. 
| | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) | +## Image-to-Video + +`generateVideo()` accepts `imageInputs` for starting-frame, ending-frame, +and reference-image conditioned video generation: + +```typescript +import { generateVideo, type ImagePart } from '@tanstack/ai' +import { openaiVideo } from '@tanstack/ai-openai' + +const startingFrame: ImagePart = { + type: 'image', + source: { + type: 'data', + value: base64Image, + mimeType: 'image/png', + }, +} + +const { jobId } = await generateVideo({ + adapter: openaiVideo('sora-2'), + prompt: 'Animate this still into a slow cinematic push-in with subtle motion', + imageInputs: [startingFrame], +}) +``` + +### Role hints + +Each `ImagePart` can carry an optional `metadata.role` hint that the +adapter uses to route the input to the provider-specific field: + +| Role | Maps to | +| --------------- | ------------------------------------------------------------- | +| `'start_frame'` | fal `start_image_url` (set only when the role is given; an input without a role goes to `image_url`) | +| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` when available) | +| `'reference'` | fal `reference_image_urls` (Veo `referenceImages`) | +| `'character'` | Same as `'reference'` — character consistency images | + +```typescript +import { falVideo } from '@tanstack/ai-fal' + +await generateVideo({ + adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'), + prompt: 'Slow cinematic push-in then a hard cut', + imageInputs: [ + { type: 'image', source: { type: 'url', value: firstFrameUrl } }, + { + type: 'image', + source: { type: 'url', value: lastFrameUrl }, + metadata: { role: 'end_frame' }, + }, + ], +}) +``` + +### Provider support + +| Provider | Image-to-Video Behavior | +| ------------ | -------------------------------------------------------------------------------------------------------- | +| **OpenAI** | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. 
| +| **fal.ai** | Single input → `image_url` (start frame). `role: 'end_frame'` → `end_image_url`. `role: 'start_frame'` → `start_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. | +| **Gemini** | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands. | + +Adapters whose underlying API can't accept image inputs throw a clear +runtime error so calls fail fast. + ### Supported Sizes Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create): diff --git a/packages/ai-event-client/src/index.ts b/packages/ai-event-client/src/index.ts index ede5544ee..0bd1fdc12 100644 --- a/packages/ai-event-client/src/index.ts +++ b/packages/ai-event-client/src/index.ts @@ -619,6 +619,12 @@ export interface ImageRequestStartedEvent extends BaseEventContext { prompt: string numberOfImages?: number size?: string + /** Count of image conditioning inputs (image-to-image, mask, reference). */ + imageInputCount?: number + /** Count of video conditioning inputs (video-to-video). */ + videoInputCount?: number + /** Count of audio conditioning inputs (lipsync, voice reference). */ + audioInputCount?: number } /** Emitted when an image request completes. 
*/ diff --git a/packages/ai-fal/src/adapters/image.ts b/packages/ai-fal/src/adapters/image.ts index 503affefb..54a2134d1 100644 --- a/packages/ai-fal/src/adapters/image.ts +++ b/packages/ai-fal/src/adapters/image.ts @@ -7,6 +7,7 @@ import { generateId as utilGenerateId, } from '../utils' import { mapSizeToFalFormat } from '../image/image-provider-options' +import { mapImageInputsToFalFields } from '../image/image-inputs' import type { OutputType, Result } from '@fal-ai/client' import type { FalClientConfig } from '../utils' import type { @@ -68,6 +69,17 @@ export class FalImageAdapter extends BaseImageAdapter< model: this.model, }) + if (options.videoInputs?.length) { + throw new Error( + `fal.generateImages does not support videoInputs on model ${this.model}.`, + ) + } + if (options.audioInputs?.length) { + throw new Error( + `fal.generateImages does not support audioInputs on model ${this.model}.`, + ) + } + try { const input = this.buildInput(options) const result = await fal.subscribe(this.model, { input }) @@ -88,9 +100,14 @@ export class FalImageAdapter extends BaseImageAdapter< >, ): FalModelInput { const sizeParams = mapSizeToFalFormat(options.size) + // Order matters: in an object spread the last write wins. modelOptions is + // the base, then size, then the derived image-input fields (these replace + // same-named modelOptions keys such as mask_url), then prompt / num_images. 
+ const inputFields = mapImageInputsToFalFields(options.imageInputs) const input = { ...options.modelOptions, ...sizeParams, + ...inputFields, prompt: options.prompt, num_images: options.numberOfImages, } as FalModelInput diff --git a/packages/ai-fal/src/adapters/video.ts b/packages/ai-fal/src/adapters/video.ts index 05b006069..db6c7e7e0 100644 --- a/packages/ai-fal/src/adapters/video.ts +++ b/packages/ai-fal/src/adapters/video.ts @@ -7,6 +7,8 @@ import { generateId as utilGenerateId, } from '../utils' import { mapVideoSizeToFalFormat } from '../video/video-provider-options' +import { mapImageInputsToFalVideoFields } from '../image/image-inputs' +import type { AudioPart, MediaInputMetadata, VideoPart } from '@tanstack/ai' import type { VideoGenerationOptions, VideoJobResult, @@ -21,6 +23,60 @@ import type { } from '../model-meta' import type { FalClientConfig } from '../utils' +/** + * Map video conditioning inputs onto fal field names. + * Video-to-video endpoints on fal almost universally use `video_url`; the + * occasional model takes `video_urls` (rare). Mirror the image-input logic + * positionally with a `reference` role escape hatch via `reference_video_urls`. 
+ */ +function mapVideoInputsToFalFields( + videoInputs?: ReadonlyArray>, +): Record { + if (!videoInputs || videoInputs.length === 0) return {} + const references: Array = [] + const sources: Array = [] + for (const part of videoInputs) { + const url = videoPartToUrl(part) + if (part.metadata?.role === 'reference' || part.metadata?.role === 'character') { + references.push(url) + } else { + sources.push(url) + } + } + const out: Record = {} + if (references.length > 0) out.reference_video_urls = references + if (sources.length === 1) { + out.video_url = sources[0] + } else if (sources.length > 1) { + out.video_urls = sources + } + return out +} + +function mapAudioInputsToFalFields( + audioInputs?: ReadonlyArray>, +): Record { + if (!audioInputs || audioInputs.length === 0) return {} + if (audioInputs.length > 1) { + throw new Error( + `fal: multiple audioInputs are not supported (received ${audioInputs.length}).`, + ) + } + const part = audioInputs[0]! + return { + audio_url: + part.source.type === 'url' + ? part.source.value + : `data:${part.source.mimeType};base64,${part.source.value}`, + } +} + +function videoPartToUrl(part: VideoPart): string { + return part.source.type === 'url' + ? 
part.source.value + : `data:${part.source.mimeType};base64,${part.source.value}` +} + type FalQueueStatus = 'IN_QUEUE' | 'IN_PROGRESS' | 'COMPLETED' interface FalStatusResponse { @@ -85,7 +141,16 @@ export class FalVideoAdapter extends BaseVideoAdapter< FalModelVideoSize >, ): Promise { - const { prompt, size, duration, modelOptions, logger } = options + const { + prompt, + size, + duration, + modelOptions, + logger, + imageInputs, + videoInputs, + audioInputs, + } = options logger.request(`activity=generateVideo provider=fal model=${this.model}`, { provider: 'fal', @@ -94,10 +159,16 @@ export class FalVideoAdapter extends BaseVideoAdapter< try { const sizeParams = mapVideoSizeToFalFormat(size) + const inputImageFields = mapImageInputsToFalVideoFields(imageInputs) + const videoFields = mapVideoInputsToFalFields(videoInputs) + const audioFields = mapAudioInputsToFalFields(audioInputs) const input = { ...modelOptions, ...sizeParams, + ...inputImageFields, + ...videoFields, + ...audioFields, prompt, ...(duration ? { duration } : {}), } as FalModelInput diff --git a/packages/ai-fal/src/image/image-inputs.ts b/packages/ai-fal/src/image/image-inputs.ts new file mode 100644 index 000000000..0a5a06ca3 --- /dev/null +++ b/packages/ai-fal/src/image/image-inputs.ts @@ -0,0 +1,165 @@ +import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' + +/** + * Map TanStack `imageInputs` onto fal.ai endpoint fields. + * + * fal endpoints use different field names for image-conditioned generation + * (~80% use `image_url` for single; the rest use `image_urls`, + * `reference_image_urls`, `mask_url`, `control_image_url`, etc.). 
Without + * per-endpoint metadata we apply this heuristic: + * + * - parts with `metadata.role === 'mask'` → `mask_url` (single) + * - parts with `metadata.role === 'control'` → `control_image_url` (single) + * - parts with `metadata.role === 'reference'` → `reference_image_urls` (array) + * - parts with `metadata.role === 'character'` → `reference_image_urls` (array) + * - remaining parts (no role, or unknown role): + * - exactly 1 part → `image_url` + * - >1 parts → `image_urls` + * + * Adapters spread `modelOptions` before these fields, so a derived field + * replaces a same-named `modelOptions` key. When the heuristic doesn't match + * an endpoint, pass the images through `modelOptions` instead of `imageInputs`. + * + * This mapping is interim and will be replaced by a per-endpoint mapping + * sourced from the `@fal-ai/schemas` library once it lands. + */ +export function mapImageInputsToFalFields( + imageInputs?: ReadonlyArray>, +): Record { + if (!imageInputs || imageInputs.length === 0) return {} + + const fields: Record = {} + + const masks: Array = [] + const controls: Array = [] + const references: Array = [] + const sources: Array = [] + + for (const part of imageInputs) { + const url = imagePartToUrl(part) + const role = part.metadata?.role + switch (role) { + case 'mask': + masks.push(url) + break + case 'control': + controls.push(url) + break + case 'reference': + case 'character': + references.push(url) + break + case 'start_frame': + case 'end_frame': + // Frame roles aren't meaningful for image generation; treat as the + // primary source. Video adapter handles start/end framing. 
+ sources.push(url) + break + default: + sources.push(url) + } + } + + if (masks.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'mask' is supported per request (received ${masks.length}).`, + ) + } + if (controls.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'control' is supported per request (received ${controls.length}).`, + ) + } + + if (masks[0]) fields.mask_url = masks[0] + if (controls[0]) fields.control_image_url = controls[0] + if (references.length > 0) fields.reference_image_urls = references + + if (sources.length === 1) { + fields.image_url = sources[0] + } else if (sources.length > 1) { + fields.image_urls = sources + } + + return fields +} + +/** + * Map TanStack `imageInputs` onto fal.ai video-endpoint fields. + * + * Video endpoints often expose a start frame as `image_url` (76% of i2v + * models) plus an optional `end_image_url`. Multi-reference video models + * (Kling O3, Seedance reference-to-video) use `reference_image_urls` or + * `image_urls`. 
Mapping: + * + * - `metadata.role === 'start_frame'` → `start_image_url` + * - `metadata.role === 'end_frame'` → `end_image_url` + * - `metadata.role === 'reference' | 'character'` → `reference_image_urls` + * - remaining parts (no role or unknown role): + * - exactly 1 part → `image_url` + * - >1 parts → `image_urls` + */ +export function mapImageInputsToFalVideoFields( + imageInputs?: ReadonlyArray>, +): Record { + if (!imageInputs || imageInputs.length === 0) return {} + + const fields: Record = {} + + const startFrames: Array = [] + const endFrames: Array = [] + const references: Array = [] + const sources: Array = [] + + for (const part of imageInputs) { + const url = imagePartToUrl(part) + const role = part.metadata?.role + switch (role) { + case 'start_frame': + startFrames.push(url) + break + case 'end_frame': + endFrames.push(url) + break + case 'reference': + case 'character': + references.push(url) + break + default: + sources.push(url) + } + } + + if (startFrames.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'start_frame' is supported (received ${startFrames.length}).`, + ) + } + if (endFrames.length > 1) { + throw new Error( + `fal: only one input with metadata.role === 'end_frame' is supported (received ${endFrames.length}).`, + ) + } + + if (startFrames[0]) fields.start_image_url = startFrames[0] + if (endFrames[0]) fields.end_image_url = endFrames[0] + if (references.length > 0) fields.reference_image_urls = references + + if (sources.length === 1) { + fields.image_url = sources[0] + } else if (sources.length > 1) { + fields.image_urls = sources + } + + return fields +} + +/** + * Convert a TanStack ImagePart into a string suitable for fal's URL-based + * input fields. URL sources pass through; data sources are emitted as a + * `data:<mimeType>;base64,<value>` URI which fal endpoints accept on the wire. 
+ */ +function imagePartToUrl(part: ImagePart): string { + if (part.source.type === 'url') return part.source.value + return `data:${part.source.mimeType};base64,${part.source.value}` +} diff --git a/packages/ai-fal/tests/image-inputs.test.ts b/packages/ai-fal/tests/image-inputs.test.ts new file mode 100644 index 000000000..0ed534080 --- /dev/null +++ b/packages/ai-fal/tests/image-inputs.test.ts @@ -0,0 +1,140 @@ +import { describe, expect, it } from 'vitest' +import { + mapImageInputsToFalFields, + mapImageInputsToFalVideoFields, +} from '../src/image/image-inputs' +import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' + +function urlPart( + value: string, + metadata?: MediaInputMetadata, +): ImagePart { + return { + type: 'image', + source: { type: 'url', value }, + ...(metadata && { metadata }), + } +} + +describe('mapImageInputsToFalFields', () => { + it('returns an empty object when imageInputs is missing or empty', () => { + expect(mapImageInputsToFalFields(undefined)).toEqual({}) + expect(mapImageInputsToFalFields([])).toEqual({}) + }) + + it('routes a single source to image_url', () => { + expect( + mapImageInputsToFalFields([urlPart('https://example.com/a.png')]), + ).toEqual({ image_url: 'https://example.com/a.png' }) + }) + + it('routes multiple sources to image_urls', () => { + expect( + mapImageInputsToFalFields([ + urlPart('https://example.com/a.png'), + urlPart('https://example.com/b.png'), + ]), + ).toEqual({ + image_urls: ['https://example.com/a.png', 'https://example.com/b.png'], + }) + }) + + it('routes role=mask to mask_url alongside the source image_url', () => { + expect( + mapImageInputsToFalFields([ + urlPart('https://example.com/img.png'), + urlPart('https://example.com/mask.png', { role: 'mask' }), + ]), + ).toEqual({ + image_url: 'https://example.com/img.png', + mask_url: 'https://example.com/mask.png', + }) + }) + + it('routes role=reference to reference_image_urls', () => { + expect( + mapImageInputsToFalFields([ + 
urlPart('https://example.com/product.png'), + urlPart('https://example.com/style.png', { role: 'reference' }), + urlPart('https://example.com/character.png', { role: 'character' }), + ]), + ).toEqual({ + image_url: 'https://example.com/product.png', + reference_image_urls: [ + 'https://example.com/style.png', + 'https://example.com/character.png', + ], + }) + }) + + it('routes role=control to control_image_url', () => { + expect( + mapImageInputsToFalFields([ + urlPart('https://example.com/img.png'), + urlPart('https://example.com/depth.png', { role: 'control' }), + ]), + ).toEqual({ + image_url: 'https://example.com/img.png', + control_image_url: 'https://example.com/depth.png', + }) + }) + + it('encodes data sources as data URIs', () => { + expect( + mapImageInputsToFalFields([ + { + type: 'image', + source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/png' }, + }, + ]), + ).toEqual({ image_url: 'data:image/png;base64,aGVsbG8=' }) + }) + + it('throws when more than one mask is provided', () => { + expect(() => + mapImageInputsToFalFields([ + urlPart('https://example.com/m1.png', { role: 'mask' }), + urlPart('https://example.com/m2.png', { role: 'mask' }), + ]), + ).toThrow(/only one input with metadata.role === 'mask'/) + }) +}) + +describe('mapImageInputsToFalVideoFields', () => { + it('returns empty for missing/empty inputs', () => { + expect(mapImageInputsToFalVideoFields(undefined)).toEqual({}) + expect(mapImageInputsToFalVideoFields([])).toEqual({}) + }) + + it('routes a single positional source to image_url (start frame)', () => { + expect( + mapImageInputsToFalVideoFields([ + urlPart('https://example.com/start.png'), + ]), + ).toEqual({ image_url: 'https://example.com/start.png' }) + }) + + it('routes role=start_frame to start_image_url and role=end_frame to end_image_url', () => { + expect( + mapImageInputsToFalVideoFields([ + urlPart('https://example.com/a.png', { role: 'start_frame' }), + urlPart('https://example.com/z.png', { role: 'end_frame' 
}), + ]), + ).toEqual({ + start_image_url: 'https://example.com/a.png', + end_image_url: 'https://example.com/z.png', + }) + }) + + it('routes role=reference to reference_image_urls', () => { + expect( + mapImageInputsToFalVideoFields([ + urlPart('https://example.com/start.png'), + urlPart('https://example.com/character.png', { role: 'reference' }), + ]), + ).toEqual({ + image_url: 'https://example.com/start.png', + reference_image_urls: ['https://example.com/character.png'], + }) + }) +}) diff --git a/packages/ai-gemini/src/adapters/image.ts b/packages/ai-gemini/src/adapters/image.ts index 35d32b857..dfaffd23f 100644 --- a/packages/ai-gemini/src/adapters/image.ts +++ b/packages/ai-gemini/src/adapters/image.ts @@ -1,4 +1,5 @@ import { BaseImageAdapter } from '@tanstack/ai/adapters' +import { arrayBufferToBase64 } from '@tanstack/ai-utils' import { createGeminiClient, generateId, @@ -22,13 +23,17 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ImagePart, + MediaInputMetadata, } from '@tanstack/ai' import type { + Content, GenerateContentConfig, GenerateContentResponse, GenerateImagesConfig, GenerateImagesResponse, GoogleGenAI, + Part, } from '@google/genai' import type { GeminiClientConfig } from '../utils' @@ -95,10 +100,29 @@ export class GeminiImageAdapter< try { validatePrompt({ prompt, model }) + if (options.videoInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support videoInputs (model: ${model}).`, + ) + } + if (options.audioInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support audioInputs (model: ${model}).`, + ) + } + if (this.isGeminiImageModel(model)) { return await this.generateWithGeminiApi(options) } + // Imagen does not accept image inputs — it's strictly text-to-image. + if (options.imageInputs?.length) { + throw new Error( + `${this.name}: model "${model}" (Imagen) does not support imageInputs. ` + + `Use a Gemini-native image model (e.g. 
gemini-2.5-flash-image, "nano-banana") for image-conditioned generation.`, + ) + } + // Imagen models path (generateImages API) validateImageSize(model, options.size) validateNumberOfImages(model, options.numberOfImages) @@ -128,7 +152,8 @@ export class GeminiImageAdapter< private async generateWithGeminiApi( options: ImageGenerationOptions, ): Promise { - const { model, prompt, size, numberOfImages, modelOptions } = options + const { model, prompt, size, numberOfImages, modelOptions, imageInputs } = + options const parsedSize = size ? parseNativeImageSize(size) : undefined @@ -170,15 +195,76 @@ export class GeminiImageAdapter< }), } + const contents = await this.buildContents(augmentedPrompt, imageInputs) + const response = await this.client.models.generateContent({ model, - contents: augmentedPrompt, + contents, config, }) return this.transformGeminiResponse(model, response) } + /** + * Build the multimodal `contents` payload. When `imageInputs` is empty the + * SDK accepts a plain prompt string; with inputs we hand it a single user + * `Content` whose `parts` interleave the inline/file image data with the + * text prompt last (Gemini conventionally treats the trailing text as the + * instruction). + */ + private async buildContents( + prompt: string, + imageInputs?: ReadonlyArray>, + ): Promise> { + if (!imageInputs || imageInputs.length === 0) { + return prompt + } + const imageParts: Array = await Promise.all( + imageInputs.map((part) => this.imagePartToGeminiPart(part)), + ) + const parts: Array = [...imageParts, { text: prompt }] + return [{ role: 'user', parts }] + } + + private async imagePartToGeminiPart( + part: ImagePart, + ): Promise { + if (part.source.type === 'data') { + return { + inlineData: { + mimeType: part.source.mimeType || 'image/png', + data: part.source.value, + }, + } + } + // For URL sources, prefer passing the URL through as `fileData` when it + // looks like a Google Files API URI; otherwise fetch and inline as base64. 
+ if (part.source.value.startsWith('gs://') || /^https?:\/\/generativelanguage\.googleapis\.com\//.test(part.source.value)) { + return { + fileData: { + fileUri: part.source.value, + ...(part.source.mimeType && { mimeType: part.source.mimeType }), + }, + } + } + const response = await fetch(part.source.value) + if (!response.ok) { + throw new Error( + `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`, + ) + } + const blob = await response.blob() + const buffer = await blob.arrayBuffer() + const base64 = arrayBufferToBase64(buffer) + return { + inlineData: { + mimeType: part.source.mimeType || blob.type || 'image/png', + data: base64, + }, + } + } + private transformGeminiResponse( model: string, response: GenerateContentResponse, diff --git a/packages/ai-grok/src/adapters/image.ts b/packages/ai-grok/src/adapters/image.ts index 6e88d815e..e5241103b 100644 --- a/packages/ai-grok/src/adapters/image.ts +++ b/packages/ai-grok/src/adapters/image.ts @@ -62,6 +62,18 @@ export class GrokImageAdapter< ): Promise { const { model, prompt, numberOfImages, size, modelOptions } = options + if ( + options.imageInputs?.length || + options.videoInputs?.length || + options.audioInputs?.length + ) { + throw new Error( + `grok.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` + + `Image-conditioned generation requires the xAI Imagine API, which the current adapter ` + + `does not target (it uses the OpenAI-compat endpoint). 
Track progress at https://github.com/TanStack/ai/issues/618.`, + ) + } + validatePrompt({ prompt, model }) validateImageSize(model, size) validateNumberOfImages(model, numberOfImages) diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index 8e980b3d1..1a488cf39 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -4,6 +4,7 @@ import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { buildImagesUsage } from '@tanstack/openai-base' import { generateId } from '@tanstack/ai-utils' import { getOpenAIApiKeyFromEnv } from '../utils/client' +import { imagePartToFile } from '../image/image-input-to-file' import { validateImageSize, validateNumberOfImages, @@ -13,6 +14,8 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ImagePart, + MediaInputMetadata, } from '@tanstack/ai' import type OpenAI_SDK from 'openai' import type { OpenAIImageModel } from '../model-meta' @@ -23,6 +26,15 @@ import type { } from '../image/image-provider-options' import type { OpenAIClientConfig } from '../utils/client' +// Per OpenAI docs: dall-e-2 accepts 1 image to `images.edit()`; gpt-image-1 +// and gpt-image-1-mini accept up to 16; dall-e-3 does not support edit at all. 
+const EDIT_MAX_IMAGES: Record = { + 'dall-e-2': 1, + 'gpt-image-1': 16, + 'gpt-image-1-mini': 16, + 'dall-e-3': 0, +} + /** * Configuration for OpenAI image adapter */ @@ -60,12 +72,44 @@ export class OpenAIImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - const { model, prompt, numberOfImages, size, modelOptions } = options + const { + model, + prompt, + numberOfImages, + size, + modelOptions, + imageInputs, + videoInputs, + audioInputs, + } = options validatePrompt({ prompt, model }) validateImageSize(model, size) validateNumberOfImages(model, numberOfImages) + if (videoInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support videoInputs (model: ${model}).`, + ) + } + if (audioInputs?.length) { + throw new Error( + `${this.name}.generateImages does not support audioInputs (model: ${model}).`, + ) + } + + if (imageInputs && imageInputs.length > 0) { + return this.editImages({ + model: model as OpenAIImageModel, + prompt, + numberOfImages, + size, + modelOptions, + imageInputs, + logger: options.logger, + }) + } + // With exactOptionalPropertyTypes, vendor SDK request shapes reject // `T | undefined` in optional fields. Build the request incrementally and // only set `size` when it's actually defined. @@ -128,6 +172,128 @@ export class OpenAIImageAdapter< throw error } } + + /** + * Image-conditioned generation via OpenAI's `images.edit()` endpoint. + * dall-e-2 accepts 1 input image; gpt-image-1 / gpt-image-1-mini accept up + * to 16; dall-e-3 rejects entirely. A part with `metadata.role === 'mask'` + * is routed to the SDK's `mask` field (PNG with alpha channel). 
+ */ + private async editImages(args: { + model: OpenAIImageModel + prompt: string + numberOfImages?: number + size?: string + modelOptions?: OpenAIImageProviderOptions + imageInputs: ReadonlyArray> + logger: ImageGenerationOptions['logger'] + }): Promise { + const { model, prompt, numberOfImages, size, modelOptions, logger } = args + const maxImages = EDIT_MAX_IMAGES[model] + if (maxImages === 0) { + throw new Error( + `${this.name}: model "${model}" does not support imageInputs. ` + + `Use gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`, + ) + } + + const maskParts = args.imageInputs.filter( + (part) => part.metadata?.role === 'mask', + ) + const sourceParts = args.imageInputs.filter( + (part) => part.metadata?.role !== 'mask', + ) + + if (maskParts.length > 1) { + throw new Error( + `${this.name}: only one input with metadata.role === 'mask' is supported per request.`, + ) + } + if (sourceParts.length === 0) { + throw new Error( + `${this.name}: imageInputs contained only mask parts; at least one source image is required.`, + ) + } + if (sourceParts.length > maxImages) { + throw new Error( + `${this.name}: model "${model}" accepts at most ${maxImages} source image(s); received ${sourceParts.length}.`, + ) + } + + const sourceFiles = await Promise.all( + sourceParts.map((part, i) => imagePartToFile(part, `source-${i}`)), + ) + const maskFile = maskParts[0] + ? await imagePartToFile(maskParts[0], 'mask') + : undefined + + // `modelOptions` is typed across all four image models (including dall-e-3's + // `quality: 'hd' | 'standard'` which isn't valid for edit). dall-e-3 has + // already been rejected above, so any remaining quality value is valid for + // the edit endpoint — cast the spread to clear the union mismatch. + const request: OpenAI_SDK.Images.ImageEditParamsNonStreaming = { + model, + prompt, + image: sourceFiles.length === 1 ? sourceFiles[0]! : sourceFiles, + n: numberOfImages ?? 1, + stream: false, + ...((modelOptions ?? 
{}) as Partial), + } + if (size !== undefined) { + request.size = size as Exclude< + OpenAI_SDK.Images.ImageEditParamsNonStreaming['size'], + undefined + > + } + if (maskFile) { + request.mask = maskFile + } + + try { + logger.request( + `activity=imageEdit provider=${this.name} model=${model} n=${request.n ?? 1} size=${request.size ?? 'default'} sources=${sourceFiles.length}${maskFile ? ' mask' : ''}`, + { provider: this.name, model }, + ) + const response = await this.client.images.edit(request) + + const images: Array = (response.data ?? []).flatMap( + (item): Array => { + const revisedPromptField = + item.revised_prompt !== undefined + ? { revisedPrompt: item.revised_prompt } + : {} + if (item.b64_json) { + return [{ b64Json: item.b64_json, ...revisedPromptField }] + } + if (item.url) { + return [{ url: item.url, ...revisedPromptField }] + } + return [] + }, + ) + + return { + id: generateId(this.name), + model, + images, + ...(response.usage + ? { + usage: { + inputTokens: response.usage.input_tokens, + outputTokens: response.usage.output_tokens, + totalTokens: response.usage.total_tokens, + }, + } + : {}), + } + } catch (error: unknown) { + logger.errors(`${this.name}.editImages fatal`, { + error: toRunErrorPayload(error, `${this.name}.editImages failed`), + source: `${this.name}.editImages`, + }) + throw error + } + } } /** diff --git a/packages/ai-openai/src/adapters/video.ts b/packages/ai-openai/src/adapters/video.ts index 6bf4652f8..cfd596faf 100644 --- a/packages/ai-openai/src/adapters/video.ts +++ b/packages/ai-openai/src/adapters/video.ts @@ -3,6 +3,7 @@ import { BaseVideoAdapter } from '@tanstack/ai/adapters' import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { arrayBufferToBase64 } from '@tanstack/ai-utils' import { getOpenAIApiKeyFromEnv } from '../utils/client' +import { imagePartToFile } from '../image/image-input-to-file' import { toApiSeconds, validateVideoSeconds, @@ -87,15 +88,38 @@ export class OpenAIVideoAdapter< 
options: VideoGenerationOptions, ): Promise { const { model, size, duration, modelOptions } = options + const { imageInputs, videoInputs, audioInputs } = options validateVideoSize(model, size) const seconds = duration ?? modelOptions?.seconds validateVideoSeconds(model, seconds) + if (videoInputs?.length) { + throw new Error( + `${this.name}.createVideoJob does not support videoInputs (model: ${model}).`, + ) + } + if (audioInputs?.length) { + throw new Error( + `${this.name}.createVideoJob does not support audioInputs (model: ${model}).`, + ) + } + if (imageInputs && imageInputs.length > 1) { + throw new Error( + `${this.name}: Sora accepts at most one input_reference image; received ${imageInputs.length}.`, + ) + } + const request: OpenAI_SDK.Videos.VideoCreateParams = { model, prompt: options.prompt, } + if (imageInputs && imageInputs[0]) { + // Sora's `input_reference` is a single Uploadable; convert TanStack + // ImagePart (URL or base64) → File before handing it to the SDK. + const file = await imagePartToFile(imageInputs[0], 'input-reference') + ;(request as { input_reference?: unknown }).input_reference = file + } // `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we // narrow before assignment instead of casting from a `T | undefined` source. if (size) { diff --git a/packages/ai-openai/src/image/image-input-to-file.ts b/packages/ai-openai/src/image/image-input-to-file.ts new file mode 100644 index 000000000..2074496fd --- /dev/null +++ b/packages/ai-openai/src/image/image-input-to-file.ts @@ -0,0 +1,70 @@ +import { base64ToArrayBuffer } from '@tanstack/ai-utils' +import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' + +const DEFAULT_MIME = 'image/png' +const MIME_TO_EXT: Record = { + 'image/png': 'png', + 'image/jpeg': 'jpg', + 'image/jpg': 'jpg', + 'image/webp': 'webp', + 'image/gif': 'gif', +} + +function extForMime(mimeType: string): string { + return MIME_TO_EXT[mimeType] ?? mimeType.split('/')[1] ?? 
'png' +} + +function ensureFileSupport(): void { + if (typeof File === 'undefined') { + throw new Error( + '`File` is not available in this environment. ' + + 'Image-conditioned generation requires Node 20+ or a browser context.', + ) + } +} + +/** + * Convert a TanStack `ImagePart` into an OpenAI-compatible `File`. + * + * - `source.type === 'data'`: decode base64 → ArrayBuffer → File. + * - `source.type === 'url'`: fetch the URL (or parse data: URI) → File. + * + * The mime type comes from the source when available, else the fetched + * blob's type, else the URL extension, else `image/png`. + */ +export async function imagePartToFile( + part: ImagePart, + fallbackName: string, +): Promise { + ensureFileSupport() + + if (part.source.type === 'data') { + const mimeType = part.source.mimeType || DEFAULT_MIME + const bytes = base64ToArrayBuffer(part.source.value) + return new File([bytes], `${fallbackName}.${extForMime(mimeType)}`, { + type: mimeType, + }) + } + + // URL source — also handles data: URIs uniformly via fetch(). 
+ const response = await fetch(part.source.value) + if (!response.ok) { + throw new Error( + `Failed to fetch image input (${response.status} ${response.statusText}): ${part.source.value}`, + ) + } + const blob = await response.blob() + const mimeType = + part.source.mimeType || blob.type || inferMimeFromUrl(part.source.value) + return new File([blob], `${fallbackName}.${extForMime(mimeType)}`, { + type: mimeType, + }) +} + +function inferMimeFromUrl(url: string): string { + const match = url.match(/\.(png|jpe?g|webp|gif)(?:\?|#|$)/i) + if (!match || !match[1]) return DEFAULT_MIME + const ext = match[1].toLowerCase() + if (ext === 'jpg' || ext === 'jpeg') return 'image/jpeg' + return `image/${ext}` +} diff --git a/packages/ai-openai/tests/image-adapter.test.ts b/packages/ai-openai/tests/image-adapter.test.ts index 6944c3e02..e0ee5dd32 100644 --- a/packages/ai-openai/tests/image-adapter.test.ts +++ b/packages/ai-openai/tests/image-adapter.test.ts @@ -25,6 +25,9 @@ class TestOpenAIImageAdapter< spyOnImagesGenerate() { return vi.spyOn(this.client.images, 'generate') } + spyOnImagesEdit() { + return vi.spyOn(this.client.images, 'edit') + } } describe('OpenAI Image Adapter', () => { @@ -234,4 +237,158 @@ describe('OpenAI Image Adapter', () => { expect(result2.id).toMatch(/^openai-/) }) }) + + describe('imageInputs (image-conditioned generation)', () => { + const imagesEditResponse: OpenAI.Images.ImagesResponse = { + created: 0, + data: [{ b64_json: 'edited-base64' }], + } + + it('routes to images.edit() for gpt-image-1 when imageInputs is present', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + const editSpy = adapter + .spyOnImagesEdit() + .mockResolvedValueOnce(imagesEditResponse) + const generateSpy = adapter.spyOnImagesGenerate() + + const result = await adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'Make it cinematic', + imageInputs: [ + { + type: 'image', + source: { + type: 'data', + 
value: 'aGVsbG8=', + mimeType: 'image/png', + }, + }, + ], + logger: testLogger, + }) + + expect(generateSpy).not.toHaveBeenCalled() + expect(editSpy).toHaveBeenCalledTimes(1) + const editArgs = editSpy.mock.calls[0]![0] + expect(editArgs.model).toBe('gpt-image-1') + expect(editArgs.prompt).toBe('Make it cinematic') + expect(editArgs.image).toBeInstanceOf(File) + expect(result.images[0]!.b64Json).toBe('edited-base64') + }) + + it('rejects dall-e-3 with a clear error when imageInputs is present', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'dall-e-3', + ) + + await expect( + adapter.generateImages({ + model: 'dall-e-3', + prompt: 'edit', + imageInputs: [ + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/does not support imageInputs/) + }) + + it('rejects dall-e-2 when more than one source image is provided', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'dall-e-2', + ) + + await expect( + adapter.generateImages({ + model: 'dall-e-2', + prompt: 'edit', + imageInputs: [ + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + { + type: 'image', + source: { type: 'data', value: 'YnllCg==', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/at most 1 source image/) + }) + + it('routes metadata.role==="mask" to the mask param', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + const editSpy = adapter + .spyOnImagesEdit() + .mockResolvedValueOnce(imagesEditResponse) + + await adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'replace masked region', + imageInputs: [ + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + { + type: 'image', + source: { type: 'data', value: 'bWFzaw==', mimeType: 'image/png' }, 
+ metadata: { role: 'mask' }, + }, + ], + logger: testLogger, + }) + + const editArgs = editSpy.mock.calls[0]![0] + expect(editArgs.mask).toBeInstanceOf(File) + expect(editArgs.image).toBeInstanceOf(File) + }) + + it('rejects videoInputs or audioInputs', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + + await expect( + adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'x', + videoInputs: [ + { + type: 'video', + source: { type: 'url', value: 'https://example.com/v.mp4' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/videoInputs/) + + await expect( + adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'x', + audioInputs: [ + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/a.mp3' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/audioInputs/) + }) + }) }) diff --git a/packages/ai-openrouter/src/adapters/image.ts b/packages/ai-openrouter/src/adapters/image.ts index 4384713f2..786b582bd 100644 --- a/packages/ai-openrouter/src/adapters/image.ts +++ b/packages/ai-openrouter/src/adapters/image.ts @@ -65,6 +65,18 @@ export class OpenRouterImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { + if ( + options.imageInputs?.length || + options.videoInputs?.length || + options.audioInputs?.length + ) { + throw new Error( + `openrouter.generateImages does not yet support imageInputs / videoInputs / audioInputs. 
` + + `Image-conditioned generation via OpenRouter requires injecting parts into the multimodal ` + + `chat-completions messages array; this is tracked at https://github.com/TanStack/ai/issues/618.`, + ) + } + const { model, prompt, numberOfImages, size, modelOptions, logger } = options // Use provided aspect_ratio or derive from size diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index b9c4c1a2c..fc3084d2e 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -189,6 +189,91 @@ Result shape: `ImageGenerationResult` with `images` array where each entry has `b64Json?`, `url?`, and `revisedPrompt?`. OpenAI image URLs expire after 1 hour -- download or display immediately. +#### Image-conditioned generation: `imageInputs` / `videoInputs` / `audioInputs` + +Both `generateImage()` and `generateVideo()` accept multimodal conditioning +inputs that reuse the existing `ImagePart` / `VideoPart` / `AudioPart` +shape used elsewhere in TanStack AI. Each input may carry an optional +`metadata.role` hint that adapters use to route the part to the +provider-specific field. 
+ +```typescript +import { generateImage, type ImagePart } from '@tanstack/ai' +import { openaiImage } from '@tanstack/ai-openai' + +// Image-to-image (OpenAI gpt-image-1, dall-e-2) +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Turn this into a cinematic product photo', + imageInputs: [ + { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, + ], +}) + +// Multi-reference (up to 16 for gpt-image-1; up to 14 for Gemini native) +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Apply the second image as style to the first', + imageInputs: [ + { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, + { type: 'image', source: { type: 'url', value: 'https://…/style.png' } }, + ], +}) + +// Inpaint via metadata.role === 'mask' (OpenAI gpt-image-1, dall-e-2; fal mask_url) +await generateImage({ + adapter: openaiImage('gpt-image-1'), + prompt: 'Replace the masked region with a tree', + imageInputs: [ + { type: 'image', source: { type: 'url', value: photoUrl } }, + { type: 'image', source: { type: 'url', value: maskUrl }, metadata: { role: 'mask' } }, + ], +}) + +// Image-to-video (OpenAI Sora: single input_reference; fal: image_url + optional end_image_url) +import { generateVideo } from '@tanstack/ai' +import { falVideo } from '@tanstack/ai-fal' + +await generateVideo({ + adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'), + prompt: 'Slow cinematic push-in', + imageInputs: [ + { type: 'image', source: { type: 'url', value: firstFrameUrl } }, + { + type: 'image', + source: { type: 'url', value: lastFrameUrl }, + metadata: { role: 'end_frame' }, + }, + ], +}) +``` + +**Role hints** (`metadata.role`): + +| Role | Maps to | +| --------------- | ---------------------------------------------------------------------- | +| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | +| `'character'` | Same as `'reference'`; Veo `referenceImages` slot 
| +| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | +| `'control'` | fal `control_image_url` (ControlNet / depth / pose) | +| `'start_frame'` | fal `start_image_url`; Veo `image` | +| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` | + +**Provider support matrix:** + +| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | +| ------------ | -------------------------------------------------------------------------------- | -------------------------------------------------------------------------- | +| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | +| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | +| fal | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | +| Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a | +| OpenRouter | Throws — multimodal injection pending. | n/a | +| Anthropic | n/a (no image generation API). | n/a | + +`videoInputs` and `audioInputs` follow the same `metadata.role` convention +for video-to-video and lipsync flows on fal; other providers throw when +they're passed. + ### 2. Audio Generation (Music, Sound Effects) Distinct from TTS — `generateAudio()` produces non-speech audio content. @@ -607,7 +692,45 @@ generateSpeech({ > Source: Gemini TTS adapter validation; CodeRabbit review of PR #463. -### h. LOW: Writing a logging middleware to see media chunks flow through +### h. HIGH: Passing `imageInputs` to a model that doesn't support image-conditioned generation + +Not every model accepts image-conditioned inputs. 
Adapters throw a clear +runtime error when the caller passes `imageInputs` to a model that +can't honor it (dall-e-3, Imagen, Grok, OpenRouter), so users learn at +call time rather than getting silently wrong output. + +```typescript +// WRONG — dall-e-3 has no edit/inputs API +generateImage({ + adapter: openaiImage('dall-e-3'), + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) // throws: model "dall-e-3" does not support imageInputs. + +// WRONG — Imagen is text-to-image only +generateImage({ + adapter: geminiImage('imagen-4.0-generate-001'), + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) // throws: Imagen does not support imageInputs. + +// CORRECT — use a model that supports edits/inputs +generateImage({ + adapter: openaiImage('gpt-image-1'), // edits up to 16 images + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) + +generateImage({ + adapter: geminiImage('gemini-3.1-flash-image-preview'), // native multimodal + prompt: 'Edit this', + imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], +}) +``` + +> Source: docs/media/image-generation.md, docs/media/video-generation.md. + +### i. 
LOW: Writing a logging middleware to see media chunks flow through Every media activity — `generateAudio`, `generateSpeech`, `generateTranscription`, `generateImage`, `generateVideo` — accepts the diff --git a/packages/ai/src/activities/generateImage/index.ts b/packages/ai/src/activities/generateImage/index.ts index b8d173b09..285fb6a56 100644 --- a/packages/ai/src/activities/generateImage/index.ts +++ b/packages/ai/src/activities/generateImage/index.ts @@ -11,7 +11,14 @@ import { resolveDebugOption } from '../../logger/resolve' import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { ImageAdapter } from './adapter' -import type { ImageGenerationResult, StreamChunk } from '../../types' +import type { + AudioPart, + ImageGenerationResult, + ImagePart, + MediaInputMetadata, + StreamChunk, + VideoPart, +} from '../../types' // =========================== // Activity Kind @@ -78,6 +85,17 @@ export type ImageActivityOptions< numberOfImages?: number /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */ size?: ImageSizeForModel + /** + * Image conditioning inputs for image-to-image, reference-guided, edit, or + * multi-reference generation. Each part may carry `metadata.role` + * (`'reference' | 'mask' | 'control' | 'character'`) to disambiguate intent. + * Adapters that don't support image-conditioned generation throw a clear error. + */ + imageInputs?: Array> + /** Video conditioning inputs. Provider support varies; unsupported adapters throw. */ + videoInputs?: Array> + /** Audio conditioning inputs. Provider support varies; unsupported adapters throw. */ + audioInputs?: Array> /** * Whether to stream the image generation result. * When true, returns an AsyncIterable for streaming transport. 
@@ -210,6 +228,9 @@ async function runGenerateImage< prompt: rest.prompt, numberOfImages: rest.numberOfImages, size: rest.size, + imageInputCount: rest.imageInputs?.length, + videoInputCount: rest.videoInputs?.length, + audioInputCount: rest.audioInputs?.length, modelOptions: rest.modelOptions, timestamp: startTime, }) diff --git a/packages/ai/src/activities/generateVideo/index.ts b/packages/ai/src/activities/generateVideo/index.ts index 4e0e48896..572759cdf 100644 --- a/packages/ai/src/activities/generateVideo/index.ts +++ b/packages/ai/src/activities/generateVideo/index.ts @@ -14,9 +14,13 @@ import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { VideoAdapter } from './adapter' import type { + AudioPart, + ImagePart, + MediaInputMetadata, StreamChunk, TokenUsage, VideoJobResult, + VideoPart, VideoStatusResult, VideoUrlResult, } from '../../types' @@ -90,6 +94,16 @@ export type VideoCreateOptions< size?: VideoSizeForAdapter /** Video duration in seconds */ duration?: number + /** + * Image conditioning inputs (start frame, end frame, reference / character + * images). Use `metadata.role` (`'start_frame' | 'end_frame' | 'reference' | + * 'character'`) to disambiguate intent; positional fallback otherwise. + */ + imageInputs?: Array> + /** Video conditioning inputs (video-to-video, source clip). */ + videoInputs?: Array> + /** Audio conditioning inputs (lipsync source, voice reference). */ + audioInputs?: Array> /** * Whether to stream the video generation lifecycle. 
* When true, returns an AsyncIterable that handles the full @@ -250,7 +264,16 @@ export function generateVideo< async function runCreateVideoJob< TAdapter extends VideoAdapter, >(options: VideoCreateOptions): Promise { - const { adapter, prompt, size, duration, modelOptions } = options + const { + adapter, + prompt, + size, + duration, + modelOptions, + imageInputs, + videoInputs, + audioInputs, + } = options const model = adapter.model const logger: InternalLogger = resolveDebugOption(options.debug) const providerName = @@ -270,6 +293,9 @@ async function runCreateVideoJob< size, duration, modelOptions, + imageInputs, + videoInputs, + audioInputs, logger, }) logger.output(`activity=generateVideo jobId=${result.jobId}`, { @@ -297,7 +323,16 @@ function sleep(ms: number): Promise { async function* runStreamingVideoGeneration< TAdapter extends VideoAdapter, >(options: VideoCreateOptions): AsyncIterable { - const { adapter, prompt, size, duration, modelOptions } = options + const { + adapter, + prompt, + size, + duration, + modelOptions, + imageInputs, + videoInputs, + audioInputs, + } = options const model = adapter.model const runId = options.runId ?? createId('run') const pollingInterval = options.pollingInterval ?? 2000 @@ -333,6 +368,9 @@ async function* runStreamingVideoGeneration< size, duration, modelOptions, + imageInputs, + videoInputs, + audioInputs, logger, }) diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 798b381b4..1c48f289b 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1470,6 +1470,31 @@ export interface SummarizationResult { // Image Generation Types // ============================================================================ +/** + * Optional role hint on a media input part (image / video / audio). Adapters + * read `metadata.role` to route the part to the provider-specific request + * field — e.g. 
`'mask'` → OpenAI `mask` / fal `mask_url`, `'end_frame'` → fal + * `end_image_url`, `'reference'` → fal `reference_image_urls`. When omitted + * the adapter falls back to positional routing. + */ +export type MediaInputRole = + | 'reference' + | 'mask' + | 'control' + | 'start_frame' + | 'end_frame' + | 'character' + +/** + * Metadata convention for image / video / audio inputs to media generation. + * Carried on `ImagePart.metadata` / `VideoPart.metadata` / `AudioPart.metadata` + * when used as conditioning inputs to `generateImage()` or `generateVideo()`. + */ +export interface MediaInputMetadata { + /** Optional role hint disambiguating the part's intent for the adapter */ + role?: MediaInputRole +} + /** * Options for image generation. * These are the common options supported across providers. @@ -1486,6 +1511,25 @@ export interface ImageGenerationOptions< numberOfImages?: number /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */ size?: TSize + /** + * Image conditioning inputs (reference / mask / control / start frame / + * character). Reuses the multimodal `ImagePart` shape. Adapters map these + * onto the provider-native request — e.g. OpenAI `images.edit()`, Gemini + * multimodal `contents`, fal `image_url` / `image_urls` / `mask_url`. + * Adapters that do not support image-conditioned generation throw a clear + * runtime error when this field is non-empty. + */ + imageInputs?: Array> + /** + * Video conditioning inputs (video-to-video, edit, lipsync source). + * Not all providers support this; adapters throw when unsupported. + */ + videoInputs?: Array> + /** + * Audio conditioning inputs (audio reference, voice cloning, lipsync). + * Not all providers support this; adapters throw when unsupported. 
+ */ + audioInputs?: Array> /** Model-specific options for image generation */ modelOptions?: TProviderOptions /** @@ -1608,6 +1652,24 @@ export interface VideoGenerationOptions< size?: TSize /** Video duration in seconds */ duration?: number + /** + * Image conditioning inputs (start frame, end frame, character / reference + * images). Reuses the multimodal `ImagePart` shape; adapters route by + * `metadata.role` and array position (e.g. OpenAI Sora `input_reference`, + * fal `image_url` / `end_image_url`, Veo `image` / `lastFrame` / + * `referenceImages`). Adapters throw at runtime if unsupported. + */ + imageInputs?: Array> + /** + * Video conditioning inputs (video-to-video edit, source clip). + * Not all providers support this; adapters throw when unsupported. + */ + videoInputs?: Array> + /** + * Audio conditioning inputs (lipsync source, voice reference). + * Not all providers support this; adapters throw when unsupported. + */ + audioInputs?: Array> /** Model-specific options for video generation */ modelOptions?: TProviderOptions /** diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index 6d6b950bd..bb52c762b 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -178,11 +178,21 @@ export const matrix: Record> = { ]), // Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format 'image-gen': new Set(['openai', 'grok']), + // image-to-image (imageInputs on generateImage) routes adapters to wire + // endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, Gemini + // multimodal `generateContent`, fal endpoint-specific input fields). + // Adapter-level mapping is covered by unit tests. Populate this set when + // aimock gains support for those endpoints. 
+ 'image-to-image': new Set([]), 'audio-gen': new Set(['gemini', 'elevenlabs']), 'sound-effects': new Set(['elevenlabs']), tts: new Set(['openai', 'grok', 'elevenlabs']), transcription: new Set(['openai', 'grok', 'elevenlabs']), 'video-gen': new Set(['openai']), + // image-to-video (imageInputs on generateVideo) similarly depends on + // aimock mocking Sora's `input_reference` upload field. Populate when + // aimock support lands. + 'image-to-video': new Set([]), // Only Gemini currently surfaces a first-class stateful conversation API via // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental). 'stateful-interactions': new Set(['gemini']), diff --git a/testing/e2e/src/lib/types.ts b/testing/e2e/src/lib/types.ts index 018e7744f..dcade6b1f 100644 --- a/testing/e2e/src/lib/types.ts +++ b/testing/e2e/src/lib/types.ts @@ -31,11 +31,13 @@ export type Feature = | 'summarize' | 'summarize-stream' | 'image-gen' + | 'image-to-image' | 'audio-gen' | 'sound-effects' | 'tts' | 'transcription' | 'video-gen' + | 'image-to-video' | 'stateful-interactions' export const ALL_PROVIDERS: Provider[] = [ @@ -70,10 +72,12 @@ export const ALL_FEATURES: Feature[] = [ 'summarize', 'summarize-stream', 'image-gen', + 'image-to-image', 'audio-gen', 'sound-effects', 'tts', 'transcription', 'video-gen', + 'image-to-video', 'stateful-interactions', ] From 48d0f62a9ae50379356012b99c6e6b00dc3a1b03 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 09:56:39 +0000 Subject: [PATCH 02/11] ci: apply automated fixes --- packages/ai-fal/src/adapters/video.ts | 5 +++- packages/ai-gemini/src/adapters/image.ts | 7 ++++- packages/ai-openai/src/adapters/image.ts | 3 +- .../ai-openai/tests/image-adapter.test.ts | 6 +++- .../skills/ai-core/media-generation/SKILL.md | 28 +++++++++++-------- 5 files changed, 33 insertions(+), 16 deletions(-) diff --git a/packages/ai-fal/src/adapters/video.ts 
b/packages/ai-fal/src/adapters/video.ts index db6c7e7e0..dafe5226d 100644 --- a/packages/ai-fal/src/adapters/video.ts +++ b/packages/ai-fal/src/adapters/video.ts @@ -37,7 +37,10 @@ function mapVideoInputsToFalFields( const sources: Array = [] for (const part of videoInputs) { const url = videoPartToUrl(part) - if (part.metadata?.role === 'reference' || part.metadata?.role === 'character') { + if ( + part.metadata?.role === 'reference' || + part.metadata?.role === 'character' + ) { references.push(url) } else { sources.push(url) diff --git a/packages/ai-gemini/src/adapters/image.ts b/packages/ai-gemini/src/adapters/image.ts index dfaffd23f..0fbbfadd2 100644 --- a/packages/ai-gemini/src/adapters/image.ts +++ b/packages/ai-gemini/src/adapters/image.ts @@ -240,7 +240,12 @@ export class GeminiImageAdapter< } // For URL sources, prefer passing the URL through as `fileData` when it // looks like a Google Files API URI; otherwise fetch and inline as base64. - if (part.source.value.startsWith('gs://') || /^https?:\/\/generativelanguage\.googleapis\.com\//.test(part.source.value)) { + if ( + part.source.value.startsWith('gs://') || + /^https?:\/\/generativelanguage\.googleapis\.com\//.test( + part.source.value, + ) + ) { return { fileData: { fileUri: part.source.value, diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index 1a488cf39..8553555e2 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -237,7 +237,8 @@ export class OpenAIImageAdapter< image: sourceFiles.length === 1 ? sourceFiles[0]! : sourceFiles, n: numberOfImages ?? 1, stream: false, - ...((modelOptions ?? {}) as Partial), + ...((modelOptions ?? 
+ {}) as Partial), } if (size !== undefined) { request.size = size as Exclude< diff --git a/packages/ai-openai/tests/image-adapter.test.ts b/packages/ai-openai/tests/image-adapter.test.ts index e0ee5dd32..767f3edf1 100644 --- a/packages/ai-openai/tests/image-adapter.test.ts +++ b/packages/ai-openai/tests/image-adapter.test.ts @@ -317,7 +317,11 @@ describe('OpenAI Image Adapter', () => { }, { type: 'image', - source: { type: 'data', value: 'YnllCg==', mimeType: 'image/png' }, + source: { + type: 'data', + value: 'YnllCg==', + mimeType: 'image/png', + }, }, ], logger: testLogger, diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index fc3084d2e..4aa6c8e90 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -226,7 +226,11 @@ await generateImage({ prompt: 'Replace the masked region with a tree', imageInputs: [ { type: 'image', source: { type: 'url', value: photoUrl } }, - { type: 'image', source: { type: 'url', value: maskUrl }, metadata: { role: 'mask' } }, + { + type: 'image', + source: { type: 'url', value: maskUrl }, + metadata: { role: 'mask' }, + }, ], }) @@ -250,8 +254,8 @@ await generateVideo({ **Role hints** (`metadata.role`): -| Role | Maps to | -| --------------- | ---------------------------------------------------------------------- | +| Role | Maps to | +| --------------- | ------------------------------------------------------------------------ | | `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | | `'character'` | Same as `'reference'`; Veo `referenceImages` slot | | `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | @@ -261,14 +265,14 @@ await generateVideo({ **Provider support matrix:** -| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | -| ------------ | 
-------------------------------------------------------------------------------- | -------------------------------------------------------------------------- | -| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | -| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | -| fal | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | -| Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a | -| OpenRouter | Throws — multimodal injection pending. | n/a | -| Anthropic | n/a (no image generation API). | n/a | +| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | +| ---------- | ------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | +| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | +| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | +| fal | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | +| Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a | +| OpenRouter | Throws — multimodal injection pending. 
| n/a | +| Anthropic | n/a (no image generation API). | n/a | `videoInputs` and `audioInputs` follow the same `metadata.role` convention for video-to-video and lipsync flows on fal; other providers throw when @@ -716,7 +720,7 @@ generateImage({ // CORRECT — use a model that supports edits/inputs generateImage({ - adapter: openaiImage('gpt-image-1'), // edits up to 16 images + adapter: openaiImage('gpt-image-1'), // edits up to 16 images prompt: 'Edit this', imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], }) From dfbc5e1174211cc07dc94bbbf3151bcf0c317083 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Fri, 5 Jun 2026 17:35:02 +1000 Subject: [PATCH 03/11] feat(ai-fal): resolve image-input fields per endpoint from generated SDK type map Replace the fal image-input field heuristic with a per-endpoint mapping generated from @fal-ai/client's EndpointTypeMap (scripts/ generate-fal-image-field-map.ts, run via pnpm generate:fal-image-fields). The committed artifact stores only the 362 endpoints whose field names deviate from the defaults (e.g. nano-banana edit -> image_urls, Kling i2v start frame -> image_url, Veo first-last-frame -> first_frame_url / last_frame_url, Fooocus masks -> mask_image_url); the old heuristic remains the fallback for endpoints newer than the installed SDK. Safety rails: the generated file `satisfies`-checks every field name against the SDK endpoint types (type-only, erased at runtime), and a unit test hashes the installed endpoints.d.ts against the recorded hash so an SDK bump without regeneration fails test:lib with the regen command. Mappers are now typed: both return FalImageInputFields, Pick'ed from the endpoint's real input type via a generated field-name union. Roles resolving to the same list field merge (source + reference on nano-banana); colliding scalar fields throw instead of overwriting. 
Also fixes the remaining CI lint failures: duplicate @tanstack/ai import and non-null assertion in ai-fal video.ts, switch-exhaustiveness errors in image-inputs.ts (restructured away), and the non-null assertion in ai-openai image.ts. Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/image-and-video-inputs.md | 2 +- .prettierignore | 1 + docs/media/image-generation.md | 2 +- docs/media/video-generation.md | 2 +- package.json | 1 + packages/ai-fal/src/adapters/image.ts | 5 +- packages/ai-fal/src/adapters/video.ts | 15 +- packages/ai-fal/src/image/image-inputs.ts | 285 +++++++----- packages/ai-fal/tests/image-inputs.test.ts | 136 +++++- packages/ai-openai/src/adapters/image.ts | 6 +- .../skills/ai-core/media-generation/SKILL.md | 6 +- .../image/generated/image-field-overrides.ts | 430 ++++++++++++++++++ scripts/generate-fal-image-field-map.ts | 359 +++++++++++++++ 13 files changed, 1117 insertions(+), 133 deletions(-) create mode 100644 packages/typescript/ai-fal/src/image/generated/image-field-overrides.ts create mode 100644 scripts/generate-fal-image-field-map.ts diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md index 3620076c0..b91528002 100644 --- a/.changeset/image-and-video-inputs.md +++ b/.changeset/image-and-video-inputs.md @@ -15,7 +15,7 @@ Provider behavior in this release: - **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error. - **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws. - **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only). 
-- **fal.ai** — Inputs map to fal field names: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Video adapter additionally honors `role: 'start_frame'` / `'end_frame'`. +- **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). - **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work). - **Anthropic** — Unchanged (no image generation API). diff --git a/.prettierignore b/.prettierignore index c72af168a..1acae11c2 100644 --- a/.prettierignore +++ b/.prettierignore @@ -5,6 +5,7 @@ **/coverage **/dist **/docs +packages/typescript/ai-fal/src/image/generated/ pnpm-lock.yaml .angular diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md index 1a6481da9..c2f5d34f2 100644 --- a/docs/media/image-generation.md +++ b/docs/media/image-generation.md @@ -233,7 +233,7 @@ await generateImage({ | ------------ | --------------------------------------------------------------------------------------------------------- | | **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.
`dall-e-2` → `images.edit()` with 1 source image only.
`dall-e-3` → throws (no edit support). | | **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.
Imagen models → throws (text-to-image only). | -| **fal.ai** | 1 input → `image_url`; multiple → `image_urls`. `role: 'mask'` → `mask_url`. `role: 'control'` → `control_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | +| **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (e.g. nano-banana edit gets `image_urls`, Fooocus masks get `mask_image_url`). Defaults for unknown endpoints: 1 input → `image_url`; multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | | **Grok** | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up. | | **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up. | | **Anthropic** | n/a — no image generation API. | diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index 59ad8ec24..0ff38767b 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -430,7 +430,7 @@ await generateVideo({ | Provider | Image-to-Video Behavior | | ------------ | -------------------------------------------------------------------------------------------------------- | | **OpenAI** | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. | -| **fal.ai** | Single input → `image_url` (start frame). `role: 'end_frame'` → `end_image_url`. `role: 'start_frame'` → `start_image_url`. `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. | +| **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types — e.g. 
`role: 'start_frame'` lands on `image_url` for Kling/Veo image-to-video, `first_frame_url` for first-last-frame endpoints, and `start_image_url` otherwise. Defaults: single input → `image_url` (start frame); `role: 'end_frame'` → `end_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. | | **Gemini** | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands. | Adapters whose underlying API can't accept image inputs throw a clear diff --git a/package.json b/package.json index 924d53b10..6ffff085c 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,7 @@ "dev:chat": "pnpm --filter ts-react-chat dev", "format": "prettier --experimental-cli --ignore-unknown '**/*' --write", "generate-docs": "node scripts/generate-docs.ts && pnpm run copy:readme", + "generate:fal-image-fields": "tsx scripts/generate-fal-image-field-map.ts", "generate:models": "pnpm generate:models:fetch && pnpm regenerate:models && tsx scripts/sync-provider-models.ts && pnpm format", "generate:models:fetch": "tsx scripts/fetch-openrouter-models.ts", "regenerate:models": "tsx scripts/convert-openrouter-models.ts", diff --git a/packages/ai-fal/src/adapters/image.ts b/packages/ai-fal/src/adapters/image.ts index 54a2134d1..88c885a74 100644 --- a/packages/ai-fal/src/adapters/image.ts +++ b/packages/ai-fal/src/adapters/image.ts @@ -103,7 +103,10 @@ export class FalImageAdapter extends BaseImageAdapter< // Order matters: modelOptions first (so user overrides win for // mask_url / control_image_url / reference_image_urls), then size, // then derived image-input fields, then prompt / num_images. 
- const inputFields = mapImageInputsToFalFields(options.imageInputs) + const inputFields = mapImageInputsToFalFields( + this.model, + options.imageInputs, + ) const input = { ...options.modelOptions, ...sizeParams, diff --git a/packages/ai-fal/src/adapters/video.ts b/packages/ai-fal/src/adapters/video.ts index dafe5226d..a46db5bc5 100644 --- a/packages/ai-fal/src/adapters/video.ts +++ b/packages/ai-fal/src/adapters/video.ts @@ -8,10 +8,12 @@ import { } from '../utils' import { mapVideoSizeToFalFormat } from '../video/video-provider-options' import { mapImageInputsToFalVideoFields } from '../image/image-inputs' -import type { AudioPart, MediaInputMetadata, VideoPart } from '@tanstack/ai' import type { + AudioPart, + MediaInputMetadata, VideoGenerationOptions, VideoJobResult, + VideoPart, VideoStatusResult, VideoUrlResult, } from '@tanstack/ai' @@ -60,12 +62,12 @@ function mapAudioInputsToFalFields( audioInputs?: ReadonlyArray>, ): Record { if (!audioInputs || audioInputs.length === 0) return {} - if (audioInputs.length > 1) { + const [part, ...rest] = audioInputs + if (!part || rest.length > 0) { throw new Error( - `fal: multiple audioInputs are not supported (received ${audioInputs.length}).`, + `fal: exactly one audioInput is supported (received ${audioInputs.length}).`, ) } - const part = audioInputs[0]! 
return { audio_url: part.source.type === 'url' @@ -162,7 +164,10 @@ export class FalVideoAdapter extends BaseVideoAdapter< try { const sizeParams = mapVideoSizeToFalFormat(size) - const inputImageFields = mapImageInputsToFalVideoFields(imageInputs) + const inputImageFields = mapImageInputsToFalVideoFields( + this.model, + imageInputs, + ) const videoFields = mapVideoInputsToFalFields(videoInputs) const audioFields = mapAudioInputsToFalFields(audioInputs) diff --git a/packages/ai-fal/src/image/image-inputs.ts b/packages/ai-fal/src/image/image-inputs.ts index 0a5a06ca3..f6943b837 100644 --- a/packages/ai-fal/src/image/image-inputs.ts +++ b/packages/ai-fal/src/image/image-inputs.ts @@ -1,64 +1,159 @@ +import { FAL_IMAGE_FIELD_OVERRIDES } from './generated/image-field-overrides' +import type { + FalImageFieldName, + FalImageFieldOverride, +} from './generated/image-field-overrides' import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' +import type { FalModel, FalModelInput } from '../model-meta' /** - * Map TanStack `imageInputs` onto fal.ai endpoint fields. + * The image-conditioning fields the mappers may set, narrowed to the ones + * that actually exist on the given endpoint's input type. For endpoints + * unknown to the installed `@fal-ai/client` this widens to all known field + * names. + */ +export type FalImageInputFields = Partial< + Pick< + FalModelInput, + Extract, FalImageFieldName> + > +> + +/** + * Default field per routing role. Endpoint-specific deviations live in the + * generated `FAL_IMAGE_FIELD_OVERRIDES` map (regenerate with + * `pnpm generate:fal-image-fields`); these defaults must stay in sync with + * `DEFAULTS` in scripts/generate-fal-image-field-map.ts. 
+ */ +const DEFAULT_FIELDS = { + single: 'image_url', + multi: 'image_urls', + mask: 'mask_url', + control: 'control_image_url', + reference: 'reference_image_urls', + start: 'start_image_url', + end: 'end_image_url', +} satisfies Required + +/** + * Field names that accept an array of images. The generator asserts the + * SDK types agree with this set, so wrap-vs-scalar decisions stay correct. + */ +const LIST_FIELDS = new Set([ + 'image_urls', + 'input_image_urls', + 'ref_image_urls', + 'reference_image_urls', +]) + +/** Resolve the per-role field names for a model: defaults + generated overrides. */ +function fieldSpecFor(model: string): Required { + const overrides = ( + FAL_IMAGE_FIELD_OVERRIDES as Record + )[model] + return { ...DEFAULT_FIELDS, ...overrides } +} + +/** + * Assign URLs to a field, wrapping or unwrapping based on whether the field + * takes an array. When two roles resolve to the same list field (e.g. + * sources and references both land on `image_urls` for nano-banana edit) + * the values are merged in assignment order; two roles resolving to the + * same scalar field is ambiguous and throws. Throws when multiple images + * target a scalar field. + */ +function assignField( + fields: Record, + field: string, + urls: Array, + model: string, + what: string, +): void { + if (urls.length === 0) return + const existing = fields[field] + if (LIST_FIELDS.has(field)) { + fields[field] = Array.isArray(existing) ? [...existing, ...urls] : urls + } else if (existing !== undefined) { + throw new Error( + `fal: multiple inputs map to '${field}' on model ${model}. 
Drop one of the conflicting inputs or pass the field explicitly via modelOptions.`, + ) + } else if (urls.length === 1) { + fields[field] = urls[0] + } else { + throw new Error( + `fal: model ${model} accepts a single ${what} image via '${field}' (received ${urls.length}).`, + ) + } +} + +interface RoleBuckets { + sources: Array + masks: Array + controls: Array + references: Array + starts: Array + ends: Array +} + +function bucketByRole( + imageInputs: ReadonlyArray>, +): RoleBuckets { + const buckets: RoleBuckets = { + sources: [], + masks: [], + controls: [], + references: [], + starts: [], + ends: [], + } + for (const part of imageInputs) { + const url = imagePartToUrl(part) + const role = part.metadata?.role + if (role === 'mask') buckets.masks.push(url) + else if (role === 'control') buckets.controls.push(url) + else if (role === 'reference' || role === 'character') + buckets.references.push(url) + else if (role === 'start_frame') buckets.starts.push(url) + else if (role === 'end_frame') buckets.ends.push(url) + else buckets.sources.push(url) + } + return buckets +} + +/** + * Map TanStack `imageInputs` onto fal.ai image-endpoint fields. * * fal endpoints use different field names for image-conditioned generation * (~80% use `image_url` for single; the rest use `image_urls`, - * `reference_image_urls`, `mask_url`, `control_image_url`, etc.). Without - * per-endpoint metadata we apply this heuristic: + * `reference_image_urls`, `mask_url`, `control_image_url`, etc.). 
Field + * names are resolved per endpoint from the generated + * `FAL_IMAGE_FIELD_OVERRIDES` map (derived from the fal SDK's endpoint + * types), falling back to the defaults above for endpoints the installed + * SDK doesn't know: * - * - parts with `metadata.role === 'mask'` → `mask_url` (single) - * - parts with `metadata.role === 'control'` → `control_image_url` (single) - * - parts with `metadata.role === 'reference'` → `reference_image_urls` (array) - * - parts with `metadata.role === 'character'` → `reference_image_urls` (array) - * - remaining parts (no role, or unknown role): - * - exactly 1 part → `image_url` - * - >1 parts → `image_urls` + * - parts with `metadata.role === 'mask'` → spec.mask (single) + * - parts with `metadata.role === 'control'` → spec.control (single) + * - `role === 'reference' | 'character'` → spec.reference + * - `role === 'start_frame' | 'end_frame'` → treated as sources (frame + * roles only apply to video generation) + * - remaining parts → spec.single / spec.multi * * Users can always override the resulting field shape via `modelOptions` * (spread before these fields), or pass everything through `modelOptions` - * directly when the heuristic doesn't match an obscure endpoint. - * - * This mapping is interim and will be replaced by a per-endpoint mapping - * sourced from the `@fal-ai/schemas` library once it lands. + * directly when the mapping doesn't match an obscure endpoint. 
*/ -export function mapImageInputsToFalFields( +export function mapImageInputsToFalFields( + model: TModel, imageInputs?: ReadonlyArray>, -): Record { +): FalImageInputFields { if (!imageInputs || imageInputs.length === 0) return {} - const fields: Record = {} - - const masks: Array = [] - const controls: Array = [] - const references: Array = [] - const sources: Array = [] - - for (const part of imageInputs) { - const url = imagePartToUrl(part) - const role = part.metadata?.role - switch (role) { - case 'mask': - masks.push(url) - break - case 'control': - controls.push(url) - break - case 'reference': - case 'character': - references.push(url) - break - case 'start_frame': - case 'end_frame': - // Frame roles aren't meaningful for image generation; treat as the - // primary source. Video adapter handles start/end framing. - sources.push(url) - break - default: - sources.push(url) - } - } + const spec = fieldSpecFor(model) + const { sources, masks, controls, references, starts, ends } = + bucketByRole(imageInputs) + // Frame roles aren't meaningful for image generation; treat as the + // primary source. The video mapper handles start/end framing. + const allSources = [...sources, ...starts, ...ends] if (masks.length > 1) { throw new Error( @@ -71,17 +166,14 @@ export function mapImageInputsToFalFields( ) } - if (masks[0]) fields.mask_url = masks[0] - if (controls[0]) fields.control_image_url = controls[0] - if (references.length > 0) fields.reference_image_urls = references - - if (sources.length === 1) { - fields.image_url = sources[0] - } else if (sources.length > 1) { - fields.image_urls = sources - } + const fields: Record = {} + const sourceField = allSources.length > 1 ? 
spec.multi : spec.single + assignField(fields, sourceField, allSources, model, 'source') + assignField(fields, spec.reference, references, model, 'reference') + assignField(fields, spec.mask, masks, model, 'mask') + assignField(fields, spec.control, controls, model, 'control') - return fields + return fields as FalImageInputFields } /** @@ -90,68 +182,49 @@ export function mapImageInputsToFalFields( * Video endpoints often expose a start frame as `image_url` (76% of i2v * models) plus an optional `end_image_url`. Multi-reference video models * (Kling O3, Seedance reference-to-video) use `reference_image_urls` or - * `image_urls`. Mapping: + * `image_urls`. Field names resolve through the same generated override + * map as the image mapper — e.g. `role: 'start_frame'` lands on `image_url` + * for Kling/Veo image-to-video and `first_frame_url` for Pixverse. Mapping: * - * - `metadata.role === 'start_frame'` → `start_image_url` - * - `metadata.role === 'end_frame'` → `end_image_url` - * - `metadata.role === 'reference' | 'character'` → `reference_image_urls` - * - remaining parts (no role or unknown role): - * - exactly 1 part → `image_url` - * - >1 parts → `image_urls` + * - `metadata.role === 'start_frame'` → spec.start + * - `metadata.role === 'end_frame'` → spec.end + * - `metadata.role === 'reference' | 'character'` → spec.reference + * - remaining parts (any other / no role) → spec.single / spec.multi */ -export function mapImageInputsToFalVideoFields( +export function mapImageInputsToFalVideoFields( + model: TModel, imageInputs?: ReadonlyArray>, -): Record { +): FalImageInputFields { if (!imageInputs || imageInputs.length === 0) return {} - const fields: Record = {} - - const startFrames: Array = [] - const endFrames: Array = [] - const references: Array = [] - const sources: Array = [] - - for (const part of imageInputs) { - const url = imagePartToUrl(part) - const role = part.metadata?.role - switch (role) { - case 'start_frame': - startFrames.push(url) - 
break - case 'end_frame': - endFrames.push(url) - break - case 'reference': - case 'character': - references.push(url) - break - default: - sources.push(url) - } - } + const spec = fieldSpecFor(model) + const { sources, masks, controls, references, starts, ends } = + bucketByRole(imageInputs) + // Mask / control roles have no video-specific routing; treat as sources. + const allSources = [...sources, ...masks, ...controls] - if (startFrames.length > 1) { + if (starts.length > 1) { throw new Error( - `fal: only one input with metadata.role === 'start_frame' is supported (received ${startFrames.length}).`, + `fal: only one input with metadata.role === 'start_frame' is supported (received ${starts.length}).`, ) } - if (endFrames.length > 1) { + if (ends.length > 1) { throw new Error( - `fal: only one input with metadata.role === 'end_frame' is supported (received ${endFrames.length}).`, + `fal: only one input with metadata.role === 'end_frame' is supported (received ${ends.length}).`, ) } - if (startFrames[0]) fields.start_image_url = startFrames[0] - if (endFrames[0]) fields.end_image_url = endFrames[0] - if (references.length > 0) fields.reference_image_urls = references - - if (sources.length === 1) { - fields.image_url = sources[0] - } else if (sources.length > 1) { - fields.image_urls = sources - } - - return fields + const fields: Record = {} + const sourceField = allSources.length > 1 ? spec.multi : spec.single + assignField(fields, sourceField, allSources, model, 'source') + assignField(fields, spec.reference, references, model, 'reference') + // Frame roles assign last: when an endpoint routes the start frame to its + // generic source field (e.g. Kling image-to-video) and an unroled source + // was also provided, assignField rejects the ambiguous combination. 
+ assignField(fields, spec.start, starts, model, 'start frame') + assignField(fields, spec.end, ends, model, 'end frame') + + return fields as FalImageInputFields } /** diff --git a/packages/ai-fal/tests/image-inputs.test.ts b/packages/ai-fal/tests/image-inputs.test.ts index 0ed534080..2e5a13eea 100644 --- a/packages/ai-fal/tests/image-inputs.test.ts +++ b/packages/ai-fal/tests/image-inputs.test.ts @@ -1,10 +1,17 @@ +import { createHash } from 'node:crypto' +import { readFileSync } from 'node:fs' +import { createRequire } from 'node:module' import { describe, expect, it } from 'vitest' import { mapImageInputsToFalFields, mapImageInputsToFalVideoFields, } from '../src/image/image-inputs' +import { FAL_ENDPOINTS_DTS_SHA256 } from '../src/image/generated/image-field-overrides' import type { ImagePart, MediaInputMetadata } from '@tanstack/ai' +/** A model id unknown to the SDK — exercises the default field mapping. */ +const UNKNOWN_MODEL = 'custom-org/not-in-sdk' + function urlPart( value: string, metadata?: MediaInputMetadata, @@ -18,19 +25,21 @@ function urlPart( describe('mapImageInputsToFalFields', () => { it('returns an empty object when imageInputs is missing or empty', () => { - expect(mapImageInputsToFalFields(undefined)).toEqual({}) - expect(mapImageInputsToFalFields([])).toEqual({}) + expect(mapImageInputsToFalFields(UNKNOWN_MODEL, undefined)).toEqual({}) + expect(mapImageInputsToFalFields(UNKNOWN_MODEL, [])).toEqual({}) }) it('routes a single source to image_url', () => { expect( - mapImageInputsToFalFields([urlPart('https://example.com/a.png')]), + mapImageInputsToFalFields(UNKNOWN_MODEL, [ + urlPart('https://example.com/a.png'), + ]), ).toEqual({ image_url: 'https://example.com/a.png' }) }) it('routes multiple sources to image_urls', () => { expect( - mapImageInputsToFalFields([ + mapImageInputsToFalFields(UNKNOWN_MODEL, [ urlPart('https://example.com/a.png'), urlPart('https://example.com/b.png'), ]), @@ -41,7 +50,7 @@ 
describe('mapImageInputsToFalFields', () => { it('routes role=mask to mask_url alongside the source image_url', () => { expect( - mapImageInputsToFalFields([ + mapImageInputsToFalFields(UNKNOWN_MODEL, [ urlPart('https://example.com/img.png'), urlPart('https://example.com/mask.png', { role: 'mask' }), ]), @@ -53,7 +62,7 @@ describe('mapImageInputsToFalFields', () => { it('routes role=reference to reference_image_urls', () => { expect( - mapImageInputsToFalFields([ + mapImageInputsToFalFields(UNKNOWN_MODEL, [ urlPart('https://example.com/product.png'), urlPart('https://example.com/style.png', { role: 'reference' }), urlPart('https://example.com/character.png', { role: 'character' }), @@ -69,7 +78,7 @@ describe('mapImageInputsToFalFields', () => { it('routes role=control to control_image_url', () => { expect( - mapImageInputsToFalFields([ + mapImageInputsToFalFields(UNKNOWN_MODEL, [ urlPart('https://example.com/img.png'), urlPart('https://example.com/depth.png', { role: 'control' }), ]), @@ -81,7 +90,7 @@ describe('mapImageInputsToFalFields', () => { it('encodes data sources as data URIs', () => { expect( - mapImageInputsToFalFields([ + mapImageInputsToFalFields(UNKNOWN_MODEL, [ { type: 'image', source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/png' }, @@ -92,23 +101,61 @@ describe('mapImageInputsToFalFields', () => { it('throws when more than one mask is provided', () => { expect(() => - mapImageInputsToFalFields([ + mapImageInputsToFalFields(UNKNOWN_MODEL, [ urlPart('https://example.com/m1.png', { role: 'mask' }), urlPart('https://example.com/m2.png', { role: 'mask' }), ]), ).toThrow(/only one input with metadata.role === 'mask'/) }) + + describe('generated endpoint overrides', () => { + it('routes a single source to image_urls on endpoints without a scalar field', () => { + // nano-banana edit has image_urls but no image_url + expect( + mapImageInputsToFalFields('fal-ai/nano-banana/edit', [ + urlPart('https://example.com/a.png'), + ]), + ).toEqual({ 
image_urls: ['https://example.com/a.png'] }) + }) + + it('merges sources and references when both resolve to the same list field', () => { + expect( + mapImageInputsToFalFields('fal-ai/nano-banana/edit', [ + urlPart('https://example.com/product.png'), + urlPart('https://example.com/style.png', { role: 'reference' }), + ]), + ).toEqual({ + image_urls: [ + 'https://example.com/product.png', + 'https://example.com/style.png', + ], + }) + }) + + it('routes role=mask to endpoint-specific mask field names', () => { + // gpt-image-1.5 edit uses mask_image_url instead of mask_url + expect( + mapImageInputsToFalFields('fal-ai/gpt-image-1.5/edit', [ + urlPart('https://example.com/img.png'), + urlPart('https://example.com/mask.png', { role: 'mask' }), + ]), + ).toEqual({ + image_urls: ['https://example.com/img.png'], + mask_image_url: 'https://example.com/mask.png', + }) + }) + }) }) describe('mapImageInputsToFalVideoFields', () => { it('returns empty for missing/empty inputs', () => { - expect(mapImageInputsToFalVideoFields(undefined)).toEqual({}) - expect(mapImageInputsToFalVideoFields([])).toEqual({}) + expect(mapImageInputsToFalVideoFields(UNKNOWN_MODEL, undefined)).toEqual({}) + expect(mapImageInputsToFalVideoFields(UNKNOWN_MODEL, [])).toEqual({}) }) it('routes a single positional source to image_url (start frame)', () => { expect( - mapImageInputsToFalVideoFields([ + mapImageInputsToFalVideoFields(UNKNOWN_MODEL, [ urlPart('https://example.com/start.png'), ]), ).toEqual({ image_url: 'https://example.com/start.png' }) @@ -116,7 +163,7 @@ describe('mapImageInputsToFalVideoFields', () => { it('routes role=start_frame to start_image_url and role=end_frame to end_image_url', () => { expect( - mapImageInputsToFalVideoFields([ + mapImageInputsToFalVideoFields(UNKNOWN_MODEL, [ urlPart('https://example.com/a.png', { role: 'start_frame' }), urlPart('https://example.com/z.png', { role: 'end_frame' }), ]), @@ -128,7 +175,7 @@ describe('mapImageInputsToFalVideoFields', () => { 
it('routes role=reference to reference_image_urls', () => { expect( - mapImageInputsToFalVideoFields([ + mapImageInputsToFalVideoFields(UNKNOWN_MODEL, [ urlPart('https://example.com/start.png'), urlPart('https://example.com/character.png', { role: 'reference' }), ]), @@ -137,4 +184,65 @@ describe('mapImageInputsToFalVideoFields', () => { reference_image_urls: ['https://example.com/character.png'], }) }) + + describe('generated endpoint overrides', () => { + it('routes role=start_frame to the source field on image-to-video endpoints', () => { + // Kling i2v takes the start frame as plain image_url, the end frame + // as tail_image_url + expect( + mapImageInputsToFalVideoFields( + 'fal-ai/kling-video/v2.5-turbo/pro/image-to-video', + [ + urlPart('https://example.com/start.png', { role: 'start_frame' }), + urlPart('https://example.com/end.png', { role: 'end_frame' }), + ], + ), + ).toEqual({ + image_url: 'https://example.com/start.png', + tail_image_url: 'https://example.com/end.png', + }) + }) + + it('routes frame roles to first/last frame fields on frame-to-video endpoints', () => { + expect( + mapImageInputsToFalVideoFields( + 'fal-ai/veo3.1/first-last-frame-to-video', + [ + urlPart('https://example.com/first.png', { role: 'start_frame' }), + urlPart('https://example.com/last.png', { role: 'end_frame' }), + ], + ), + ).toEqual({ + first_frame_url: 'https://example.com/first.png', + last_frame_url: 'https://example.com/last.png', + }) + }) + + it('throws when a source and start_frame both resolve to the same scalar field', () => { + expect(() => + mapImageInputsToFalVideoFields( + 'fal-ai/kling-video/v2.5-turbo/pro/image-to-video', + [ + urlPart('https://example.com/source.png'), + urlPart('https://example.com/start.png', { role: 'start_frame' }), + ], + ), + ).toThrow(/multiple inputs map to 'image_url'/) + }) + }) +}) + +describe('generated image-field-overrides artifact', () => { + it('matches the installed @fal-ai/client endpoint types', () => { + const require 
= createRequire(import.meta.url) + const endpointsJs = require.resolve('@fal-ai/client/endpoints') + const endpointsDts = endpointsJs.replace(/\.js$/, '.d.ts') + const hash = createHash('sha256') + .update(readFileSync(endpointsDts)) + .digest('hex') + expect( + hash, + 'image-field-overrides.ts is stale for the installed @fal-ai/client. Run: pnpm generate:fal-image-fields', + ).toBe(FAL_ENDPOINTS_DTS_SHA256) + }) }) diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index 8553555e2..a9a01abff 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -223,6 +223,7 @@ export class OpenAIImageAdapter< const sourceFiles = await Promise.all( sourceParts.map((part, i) => imagePartToFile(part, `source-${i}`)), ) + const [firstSourceFile] = sourceFiles const maskFile = maskParts[0] ? await imagePartToFile(maskParts[0], 'mask') : undefined @@ -234,7 +235,10 @@ export class OpenAIImageAdapter< const request: OpenAI_SDK.Images.ImageEditParamsNonStreaming = { model, prompt, - image: sourceFiles.length === 1 ? sourceFiles[0]! : sourceFiles, + image: + firstSourceFile && sourceFiles.length === 1 + ? firstSourceFile + : sourceFiles, n: numberOfImages ?? 1, stream: false, ...((modelOptions ?? 
diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 4aa6c8e90..af67bf332 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -260,8 +260,8 @@ await generateVideo({ | `'character'` | Same as `'reference'`; Veo `referenceImages` slot | | `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | | `'control'` | fal `control_image_url` (ControlNet / depth / pose) | -| `'start_frame'` | fal `start_image_url`; Veo `image` | -| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` | +| `'start_frame'` | fal `start_image_url` (or the endpoint's field, e.g. `image_url` on Kling i2v); Veo `image` | +| `'end_frame'` | fal `end_image_url` (or e.g. `tail_image_url` / `last_frame_url`); Veo `lastFrame` | **Provider support matrix:** @@ -269,7 +269,7 @@ await generateVideo({ | ---------- | ------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | | OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | | Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | -| fal | 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | +| fal | Per-endpoint field names from a generated map (`pnpm generate:fal-image-fields`). Defaults: 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | Per-endpoint map (e.g. 
Kling i2v start frame → `image_url`). Defaults: 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | | Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a | | OpenRouter | Throws — multimodal injection pending. | n/a | | Anthropic | n/a (no image generation API). | n/a | diff --git a/packages/typescript/ai-fal/src/image/generated/image-field-overrides.ts b/packages/typescript/ai-fal/src/image/generated/image-field-overrides.ts new file mode 100644 index 000000000..55e1f67eb --- /dev/null +++ b/packages/typescript/ai-fal/src/image/generated/image-field-overrides.ts @@ -0,0 +1,430 @@ +/* eslint-disable */ +// --------------------------------------------------------------------------- +// AUTO-GENERATED — do not edit by hand. +// +// Generated from @fal-ai/client@1.10.1 EndpointTypeMap by +// scripts/generate-fal-image-field-map.ts. Regenerate after bumping +// @fal-ai/client: +// +// pnpm tsx scripts/generate-fal-image-field-map.ts +// +// Maps fal endpoint ids to the image-conditioning input fields they accept +// whenever those differ from the defaults in image-inputs.ts. Endpoints +// matching the defaults are omitted. The `satisfies` clause below checks +// every field name against the SDK's endpoint input types at compile time +// (type-only import — nothing from endpoints.d.ts is shipped at runtime). +// --------------------------------------------------------------------------- +import type { EndpointTypeMap } from '@fal-ai/client/endpoints' + +/** sha256 of the endpoints.d.ts this file was generated from. */ +export const FAL_ENDPOINTS_DTS_SHA256 = + 'a071f97905b8a1068f924c74108b881362f20c9054620b70359c200404e23b4e' + +/** Every input field name the image-input mappers may emit. 
*/ +export type FalImageFieldName = + | 'control_image_url' + | 'end_image_url' + | 'first_frame_url' + | 'first_image_url' + | 'image_data_url' + | 'image_url' + | 'image_urls' + | 'input_image_url' + | 'input_image_urls' + | 'last_frame_url' + | 'mask_image_url' + | 'mask_url' + | 'ref_image_urls' + | 'reference_image_url' + | 'reference_image_urls' + | 'start_image_url' + | 'tail_image_url' + +/** + * Per-role input-field overrides. Roles: `single` / `multi` route unroled + * source images; the rest mirror `MediaInputRole` (`start` / `end` map the + * `start_frame` / `end_frame` roles). + */ +export interface FalImageFieldOverride { + single?: string + multi?: string + mask?: string + control?: string + reference?: string + start?: string + end?: string +} + +type InputFieldOf = Extract< + keyof EndpointTypeMap[K]['input'], + string +> + +export const FAL_IMAGE_FIELD_OVERRIDES = { + 'bytedance/lynx': { start: 'image_url' }, + 'decart/lucy-i2v': { start: 'image_url' }, + 'fal-ai/ai-avatar/multi': { start: 'image_url' }, + 'fal-ai/ai-avatar/multi-text': { start: 'image_url' }, + 'fal-ai/ai-avatar/single-text': { start: 'image_url' }, + 'fal-ai/bytedance/dreamactor/v2': { start: 'image_url' }, + 'fal-ai/bytedance/omnihuman': { start: 'image_url' }, + 'fal-ai/bytedance/omnihuman/v1.5': { start: 'image_url' }, + 'fal-ai/bytedance/seed/v2/mini': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/bytedance/seedance/v1.5/pro/image-to-video': { start: 'image_url' }, + 'fal-ai/bytedance/seedance/v1/lite/image-to-video': { start: 'image_url' }, + 'fal-ai/bytedance/seedance/v1/lite/reference-to-video': { multi: 'reference_image_urls' }, + 'fal-ai/bytedance/seedance/v1/pro/fast/image-to-video': { start: 'image_url' }, + 'fal-ai/bytedance/seedance/v1/pro/image-to-video': { start: 'image_url' }, + 'fal-ai/bytedance/seedream/v4.5/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/bytedance/seedream/v4/edit': { single: 'image_urls', reference: 
'image_urls' }, + 'fal-ai/bytedance/seedream/v5/lite/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/cogvideox-5b/image-to-video': { start: 'image_url' }, + 'fal-ai/controlnext': { start: 'image_url' }, + 'fal-ai/cosmos-predict-2.5/image-to-video': { start: 'image_url' }, + 'fal-ai/creatify/aurora': { start: 'image_url' }, + 'fal-ai/davinci-magihuman': { start: 'image_url' }, + 'fal-ai/dreamomni2/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/echomimic-v3': { start: 'image_url' }, + 'fal-ai/fast-svd-lcm': { start: 'image_url' }, + 'fal-ai/firered-image-edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/firered-image-edit-v1.1': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-flex/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-klein-4b-base-trainer': { single: 'image_data_url' }, + 'fal-ai/flux-2-klein-4b-base-trainer/edit': { single: 'image_data_url' }, + 'fal-ai/flux-2-klein-9b-base-trainer': { single: 'image_data_url' }, + 'fal-ai/flux-2-klein-9b-base-trainer/edit': { single: 'image_data_url' }, + 'fal-ai/flux-2-lora-gallery/add-background': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-lora-gallery/apartment-staging': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-lora-gallery/face-to-full-portrait': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-lora-gallery/multiple-angles': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-lora-gallery/virtual-tryon': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-max/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-pro/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2-trainer': { single: 'image_data_url' }, + 'fal-ai/flux-2-trainer-v2': { single: 'image_data_url' }, + 'fal-ai/flux-2-trainer-v2/edit': { single: 'image_data_url' }, + 'fal-ai/flux-2-trainer/edit': { 
single: 'image_data_url' }, + 'fal-ai/flux-2/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/flash/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/4b/base/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/4b/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/4b/edit/lora': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/9b/base/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/9b/base/edit/lora': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/9b/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/klein/9b/edit/lora': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/lora/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-2/turbo/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-general': { reference: 'reference_image_url' }, + 'fal-ai/flux-general/differential-diffusion': { reference: 'reference_image_url' }, + 'fal-ai/flux-general/image-to-image': { reference: 'reference_image_url' }, + 'fal-ai/flux-general/inpainting': { reference: 'reference_image_url' }, + 'fal-ai/flux-general/rf-inversion': { reference: 'reference_image_url' }, + 'fal-ai/flux-kontext-trainer': { single: 'image_data_url' }, + 'fal-ai/flux-pro/kontext/max/multi': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-pro/kontext/multi': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/flux-pulid': { reference: 'reference_image_url' }, + 'fal-ai/fooocus': { mask: 'mask_image_url' }, + 'fal-ai/fooocus/image-prompt': { mask: 'mask_image_url' }, + 'fal-ai/fooocus/inpaint': { mask: 'mask_image_url' }, + 'fal-ai/framepack': { start: 'image_url' }, + 'fal-ai/framepack/f1': { start: 'image_url' }, + 'fal-ai/framepack/flf2v': { start: 'image_url' }, + 'fal-ai/gemini-25-flash-image/edit': { 
single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/gemini-3-pro-image-preview/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/gemini-3.1-flash-image-preview/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/gemini-flash-edit/multi': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/glm-image/image-to-image': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/goal-force': { start: 'image_url' }, + 'fal-ai/got-ocr/v2': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/gpt-image-1-mini/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/gpt-image-1.5/edit': { single: 'image_urls', mask: 'mask_image_url', reference: 'image_urls' }, + 'fal-ai/gpt-image-1/edit-image': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/heygen/avatar4/image-to-video': { start: 'image_url' }, + 'fal-ai/hunyuan-3d/v3.1/pro/image-to-3d': { single: 'input_image_url' }, + 'fal-ai/hunyuan-3d/v3.1/rapid/image-to-3d': { single: 'input_image_url' }, + 'fal-ai/hunyuan-avatar': { start: 'image_url' }, + 'fal-ai/hunyuan-custom': { start: 'image_url' }, + 'fal-ai/hunyuan-image/v3/instruct/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/hunyuan-portrait': { start: 'image_url' }, + 'fal-ai/hunyuan-video-image-to-video': { start: 'image_url' }, + 'fal-ai/hunyuan-video-img2vid-lora': { start: 'image_url' }, + 'fal-ai/hunyuan-video-v1.5/image-to-video': { start: 'image_url' }, + 'fal-ai/hunyuan3d-v21': { single: 'input_image_url' }, + 'fal-ai/hunyuan3d-v3/image-to-3d': { single: 'input_image_url' }, + 'fal-ai/hunyuan3d-v3/sketch-to-3d': { single: 'input_image_url' }, + 'fal-ai/hunyuan3d/v2': { single: 'input_image_url' }, + 'fal-ai/hunyuan3d/v2/mini': { single: 'input_image_url' }, + 'fal-ai/hunyuan3d/v2/mini/turbo': { single: 'input_image_url' }, + 'fal-ai/hunyuan3d/v2/turbo': { single: 'input_image_url' }, 
+ 'fal-ai/hy-wu-edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/hyper3d/rodin': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/hyper3d/rodin/v2': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/iclight-v2': { mask: 'mask_image_url' }, + 'fal-ai/ideogram/character': { single: 'image_urls' }, + 'fal-ai/ideogram/v3': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/ideogram/v3/edit': { reference: 'image_urls' }, + 'fal-ai/ideogram/v3/reframe': { reference: 'image_urls' }, + 'fal-ai/ideogram/v3/remix': { reference: 'image_urls' }, + 'fal-ai/ideogram/v3/replace-background': { reference: 'image_urls' }, + 'fal-ai/infinitalk': { start: 'image_url' }, + 'fal-ai/infinitalk/single-text': { start: 'image_url' }, + 'fal-ai/kandinsky5-pro/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-image/o1': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-image/o3/image-to-image': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/ai-avatar/v2/pro': { start: 'image_url' }, + 'fal-ai/kling-video/ai-avatar/v2/standard': { start: 'image_url' }, + 'fal-ai/kling-video/o1/reference-to-video': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o1/standard/reference-to-video': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o1/standard/video-to-video/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o1/standard/video-to-video/reference': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o1/video-to-video/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o1/video-to-video/reference': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o3/pro/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-video/o3/pro/reference-to-video': { single: 'image_urls', reference: 
'image_urls' }, + 'fal-ai/kling-video/o3/pro/video-to-video/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o3/pro/video-to-video/reference': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o3/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-video/o3/standard/reference-to-video': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o3/standard/video-to-video/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/o3/standard/video-to-video/reference': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/kling-video/v1.5/pro/effects': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/kling-video/v1.5/pro/image-to-video': { start: 'image_url', end: 'tail_image_url' }, + 'fal-ai/kling-video/v1.6/pro/effects': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/kling-video/v1.6/pro/elements': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/kling-video/v1.6/pro/image-to-video': { start: 'image_url', end: 'tail_image_url' }, + 'fal-ai/kling-video/v1.6/standard/effects': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/kling-video/v1.6/standard/elements': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/kling-video/v1.6/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-video/v1/pro/ai-avatar': { start: 'image_url' }, + 'fal-ai/kling-video/v1/standard/ai-avatar': { start: 'image_url' }, + 'fal-ai/kling-video/v1/standard/effects': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/kling-video/v1/standard/image-to-video': { start: 'image_url', end: 'tail_image_url' }, + 'fal-ai/kling-video/v2.1/master/image-to-video': { start: 'image_url' 
}, + 'fal-ai/kling-video/v2.1/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-video/v2.5-turbo/pro/image-to-video': { start: 'image_url', end: 'tail_image_url' }, + 'fal-ai/kling-video/v2.5-turbo/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-video/v2.6/pro/motion-control': { start: 'image_url' }, + 'fal-ai/kling-video/v2.6/standard/motion-control': { start: 'image_url' }, + 'fal-ai/kling-video/v2/master/image-to-video': { start: 'image_url' }, + 'fal-ai/kling-video/v3/pro/motion-control': { start: 'image_url' }, + 'fal-ai/kling-video/v3/standard/motion-control': { start: 'image_url' }, + 'fal-ai/live-avatar': { start: 'image_url' }, + 'fal-ai/live-portrait': { start: 'image_url' }, + 'fal-ai/longcat-multi-avatar/image-audio-to-video': { start: 'image_url' }, + 'fal-ai/longcat-single-avatar/image-audio-to-video': { start: 'image_url' }, + 'fal-ai/longcat-video/distilled/image-to-video/480p': { start: 'image_url' }, + 'fal-ai/longcat-video/distilled/image-to-video/720p': { start: 'image_url' }, + 'fal-ai/longcat-video/image-to-video/480p': { start: 'image_url' }, + 'fal-ai/longcat-video/image-to-video/720p': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/audio-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/audio-to-video/lora': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/distilled/audio-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/distilled/audio-to-video/lora': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/distilled/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/distilled/video-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/distilled/video-to-video/lora': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/image-to-video/lora': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/video-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2-19b/video-to-video/lora': { start: 'image_url' }, + 'fal-ai/ltx-2.3/audio-to-video': { start: 'image_url' }, + 
'fal-ai/ltx-2.3/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2.3/image-to-video/fast': { start: 'image_url' }, + 'fal-ai/ltx-2/audio-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-2/image-to-video/fast': { start: 'image_url' }, + 'fal-ai/ltx-video-13b-dev/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-video-13b-distilled/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-video-lora/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-video-v095/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-video-v097/image-to-video': { start: 'image_url' }, + 'fal-ai/ltx-video/image-to-video': { start: 'image_url' }, + 'fal-ai/ltxv-13b-098-distilled/image-to-video': { start: 'image_url' }, + 'fal-ai/luma-dream-machine/ray-2-flash/image-to-video': { start: 'image_url' }, + 'fal-ai/luma-dream-machine/ray-2-flash/modify': { start: 'image_url' }, + 'fal-ai/luma-dream-machine/ray-2-flash/reframe': { start: 'image_url' }, + 'fal-ai/luma-dream-machine/ray-2/image-to-video': { start: 'image_url' }, + 'fal-ai/luma-dream-machine/ray-2/reframe': { start: 'image_url' }, + 'fal-ai/magi-distilled/image-to-video': { start: 'image_url' }, + 'fal-ai/magi/image-to-video': { start: 'image_url' }, + 'fal-ai/meshy/v5/multi-image-to-3d': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/minimax/hailuo-02-fast/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/hailuo-02/pro/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/hailuo-02/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/hailuo-2.3-fast/pro/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/hailuo-2.3-fast/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/hailuo-2.3/pro/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/hailuo-2.3/standard/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/video-01-director/image-to-video': { start: 'image_url' }, + 
'fal-ai/minimax/video-01-live/image-to-video': { start: 'image_url' }, + 'fal-ai/minimax/video-01/image-to-video': { start: 'image_url' }, + 'fal-ai/muse-pose': { start: 'image_url' }, + 'fal-ai/nano-banana-2/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/nano-banana-pro/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/nano-banana/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/omnigen-v1': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/omnigen-v2': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/omnipart': { single: 'input_image_url' }, + 'fal-ai/phota/create-profile': { single: 'image_data_url' }, + 'fal-ai/phota/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/pika/v1.5/pikaffects': { start: 'image_url' }, + 'fal-ai/pika/v2.1/image-to-video': { start: 'image_url' }, + 'fal-ai/pika/v2.2/image-to-video': { start: 'image_url' }, + 'fal-ai/pika/v2.2/pikaframes': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/pika/v2/pikadditions': { start: 'image_url' }, + 'fal-ai/pika/v2/turbo/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/swap': { start: 'image_url' }, + 'fal-ai/pixverse/v3.5/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v4.5/effects': { start: 'image_url' }, + 'fal-ai/pixverse/v4.5/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v4.5/image-to-video/fast': { start: 'image_url' }, + 'fal-ai/pixverse/v4.5/transition': { start: 'first_image_url' }, + 'fal-ai/pixverse/v4/effects': { start: 'image_url' }, + 'fal-ai/pixverse/v4/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v4/image-to-video/fast': { start: 'image_url' }, + 'fal-ai/pixverse/v5.5/effects': { start: 'image_url' }, + 'fal-ai/pixverse/v5.5/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v5.5/transition': { start: 'first_image_url' }, + 
'fal-ai/pixverse/v5.6/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v5.6/transition': { start: 'first_image_url' }, + 'fal-ai/pixverse/v5/effects': { start: 'image_url' }, + 'fal-ai/pixverse/v5/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v5/transition': { start: 'first_image_url' }, + 'fal-ai/pixverse/v6/image-to-video': { start: 'image_url' }, + 'fal-ai/pixverse/v6/transition': { start: 'first_image_url' }, + 'fal-ai/qwen-image-2/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-2/pro/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-2512-trainer': { single: 'image_data_url' }, + 'fal-ai/qwen-image-2512-trainer-v2': { single: 'image_data_url' }, + 'fal-ai/qwen-image-edit-2509': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/add-background': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/face-to-full-portrait': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/group-photo': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/integrate-product': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/lighting-restoration': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/multiple-angles': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/next-scene': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/remove-element': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/remove-lighting': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-lora-gallery/shirt-design': { 
single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2509-trainer': { single: 'image_data_url' }, + 'fal-ai/qwen-image-edit-2511': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2511-multiple-angles': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-2511-trainer': { single: 'image_data_url' }, + 'fal-ai/qwen-image-edit-2511/lora': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/add-background': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/face-to-full-portrait': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/group-photo': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/integrate-product': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/lighting-restoration': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/multiple-angles': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/next-scene': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/remove-element': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/remove-lighting': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-lora-gallery/shirt-design': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-edit-plus-trainer': { single: 'image_data_url' }, + 'fal-ai/qwen-image-edit-trainer': { single: 'image_data_url' }, + 'fal-ai/qwen-image-layered-trainer': { single: 'image_data_url' }, + 
'fal-ai/qwen-image-max/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/qwen-image-trainer': { single: 'image_data_url' }, + 'fal-ai/qwen-image-trainer-v2': { single: 'image_data_url' }, + 'fal-ai/scail': { start: 'image_url' }, + 'fal-ai/skyreels-i2v': { start: 'image_url' }, + 'fal-ai/sora-2/image-to-video': { start: 'image_url' }, + 'fal-ai/sora-2/image-to-video/pro': { start: 'image_url' }, + 'fal-ai/stable-avatar': { start: 'image_url' }, + 'fal-ai/stable-video': { start: 'image_url' }, + 'fal-ai/steady-dancer': { start: 'image_url' }, + 'fal-ai/trellis/multi': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/uno': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/uso': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/vecglypher/image-to-svg': { multi: 'reference_image_urls' }, + 'fal-ai/veo2/image-to-video': { start: 'image_url' }, + 'fal-ai/veo3.1/fast/first-last-frame-to-video': { start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/veo3.1/fast/image-to-video': { start: 'image_url' }, + 'fal-ai/veo3.1/first-last-frame-to-video': { start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/veo3.1/image-to-video': { start: 'image_url' }, + 'fal-ai/veo3.1/lite/first-last-frame-to-video': { start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/veo3.1/lite/image-to-video': { start: 'image_url' }, + 'fal-ai/veo3.1/reference-to-video': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/veo3/fast/image-to-video': { start: 'image_url' }, + 'fal-ai/veo3/image-to-video': { start: 'image_url' }, + 'fal-ai/video-as-prompt': { start: 'image_url' }, + 'fal-ai/vidu/image-to-video': { start: 'image_url' }, + 'fal-ai/vidu/q1/image-to-video': { start: 'image_url' }, + 'fal-ai/vidu/q1/reference-to-video': { multi: 'reference_image_urls' }, + 'fal-ai/vidu/q2/image-to-video/pro': { start: 'image_url' }, + 
'fal-ai/vidu/q2/image-to-video/turbo': { start: 'image_url' }, + 'fal-ai/vidu/q2/reference-to-image': { multi: 'reference_image_urls' }, + 'fal-ai/vidu/q2/reference-to-video/pro': { multi: 'reference_image_urls' }, + 'fal-ai/vidu/q3/image-to-video': { start: 'image_url' }, + 'fal-ai/vidu/q3/image-to-video/turbo': { start: 'image_url' }, + 'fal-ai/vidu/reference-to-image': { multi: 'reference_image_urls' }, + 'fal-ai/vidu/reference-to-video': { multi: 'reference_image_urls' }, + 'fal-ai/vidu/template-to-video': { single: 'input_image_urls', multi: 'input_image_urls', reference: 'input_image_urls' }, + 'fal-ai/wan-22-vace-fun-a14b/depth': { multi: 'ref_image_urls', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-22-vace-fun-a14b/inpainting': { multi: 'ref_image_urls', mask: 'mask_image_url', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-22-vace-fun-a14b/outpainting': { multi: 'ref_image_urls', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-22-vace-fun-a14b/pose': { multi: 'ref_image_urls', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-22-vace-fun-a14b/reframe': { start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-25-preview/image-to-image': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/wan-25-preview/image-to-video': { start: 'image_url' }, + 'fal-ai/wan-ati': { start: 'image_url' }, + 'fal-ai/wan-effects': { start: 'image_url' }, + 'fal-ai/wan-fun-control': { reference: 'reference_image_url' }, + 'fal-ai/wan-i2v': { start: 'image_url' }, + 'fal-ai/wan-i2v-lora': { start: 'image_url' }, + 'fal-ai/wan-motion': { start: 'image_url' }, + 'fal-ai/wan-move': { start: 'image_url' }, + 'fal-ai/wan-pro/image-to-video': { start: 'image_url' }, + 'fal-ai/wan-vace': { multi: 'ref_image_urls', mask: 'mask_image_url', reference: 'ref_image_urls' }, + 
'fal-ai/wan-vace-1-3b': { multi: 'ref_image_urls', mask: 'mask_image_url', reference: 'ref_image_urls' }, + 'fal-ai/wan-vace-14b': { multi: 'ref_image_urls', mask: 'mask_image_url', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-vace-14b/depth': { multi: 'ref_image_urls', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-vace-14b/inpainting': { multi: 'ref_image_urls', mask: 'mask_image_url', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-vace-14b/outpainting': { multi: 'ref_image_urls', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-vace-14b/pose': { multi: 'ref_image_urls', reference: 'ref_image_urls', start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-vace-14b/reframe': { start: 'first_frame_url', end: 'last_frame_url' }, + 'fal-ai/wan-vace-apps/video-edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/wan/v2.2-14b/animate/move': { start: 'image_url' }, + 'fal-ai/wan/v2.2-14b/animate/replace': { start: 'image_url' }, + 'fal-ai/wan/v2.2-14b/speech-to-video': { start: 'image_url' }, + 'fal-ai/wan/v2.2-5b/image-to-video': { start: 'image_url' }, + 'fal-ai/wan/v2.2-a14b/image-to-video/lora': { start: 'image_url' }, + 'fal-ai/wan/v2.2-a14b/image-to-video/turbo': { start: 'image_url' }, + 'fal-ai/wan/v2.7/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/wan/v2.7/edit-video': { reference: 'reference_image_url' }, + 'fal-ai/wan/v2.7/image-to-video': { start: 'image_url' }, + 'fal-ai/wan/v2.7/pro/edit': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/wan/v2.7/reference-to-video': { multi: 'reference_image_urls' }, + 'fal-ai/x-ailab/nsfw': { single: 'image_urls', reference: 'image_urls' }, + 'fal-ai/z-image-base-trainer': { single: 'image_data_url' }, + 'fal-ai/z-image-trainer': { single: 'image_data_url' }, + 
'fal-ai/z-image-turbo-trainer-v2': { single: 'image_data_url' }, + 'fal-ai/z-image/turbo/inpaint': { mask: 'mask_image_url' }, + 'fal-ai/z-image/turbo/inpaint/lora': { mask: 'mask_image_url' }, + 'fal-ai/z-image/turbo/tiling': { mask: 'mask_image_url' }, + 'fal-ai/z-image/turbo/tiling/lora': { mask: 'mask_image_url' }, + 'moonvalley/marey/i2v': { start: 'image_url' }, + 'moonvalley/marey/motion-transfer': { reference: 'reference_image_url' }, + 'moonvalley/marey/pose-transfer': { reference: 'reference_image_url' }, + 'openrouter/router/vision': { single: 'image_urls', reference: 'image_urls' }, + 'veed/fabric-1.0': { start: 'image_url' }, + 'veed/fabric-1.0/fast': { start: 'image_url' }, + 'veed/fabric-1.0/text': { start: 'image_url' }, + 'wan/v2.6/image-to-image': { single: 'image_urls', reference: 'image_urls' }, + 'wan/v2.6/image-to-video': { start: 'image_url' }, + 'wan/v2.6/image-to-video/flash': { start: 'image_url' }, + 'wan/v2.6/reference-to-video/flash': { single: 'image_urls', reference: 'image_urls' }, + 'xai/grok-imagine-image/edit': { single: 'image_urls', reference: 'image_urls' }, + 'xai/grok-imagine-video/image-to-video': { start: 'image_url' }, + 'xai/grok-imagine-video/reference-to-video': { multi: 'reference_image_urls' }, +} as const satisfies { + [K in keyof EndpointTypeMap]?: { + [Role in keyof FalImageFieldOverride]?: InputFieldOf + } +} diff --git a/scripts/generate-fal-image-field-map.ts b/scripts/generate-fal-image-field-map.ts new file mode 100644 index 000000000..a128c9574 --- /dev/null +++ b/scripts/generate-fal-image-field-map.ts @@ -0,0 +1,359 @@ +/** + * Generates the fal image-conditioning field-override map from the + * `EndpointTypeMap` types shipped with `@fal-ai/client`. + * + * fal endpoints use inconsistent field names for image-conditioned + * generation (`image_url` vs `image_urls` vs `first_frame_url` vs + * `mask_image_url`, ...). 
The runtime mapper in + * `packages/typescript/ai-fal/src/image/image-inputs.ts` applies a default + * field per input role; this script walks every endpoint's input type with + * the TypeScript checker and records, per role, the field the endpoint + * actually accepts whenever it differs from that default. Endpoints that + * match the defaults (the vast majority) are omitted, keeping the shipped + * artifact small. + * + * The emitted file type-checks each recorded field name against + * `EndpointTypeMap` via `satisfies` (a type-only import, erased at runtime), + * so a fal SDK bump that renames a field fails `tsc` until this script is + * re-run. A unit test compares the recorded endpoints.d.ts hash against the + * installed SDK to catch staleness. + * + * Usage: + * pnpm tsx scripts/generate-fal-image-field-map.ts # regenerate + * pnpm tsx scripts/generate-fal-image-field-map.ts --check # CI staleness check + */ + +import { createHash } from 'node:crypto' +import { readFileSync, writeFileSync } from 'node:fs' +import { dirname, resolve } from 'node:path' +import { fileURLToPath } from 'node:url' +import ts from 'typescript' + +const __dirname = dirname(fileURLToPath(import.meta.url)) +const ROOT = resolve(__dirname, '..') +const FAL_PKG = resolve(ROOT, 'packages/typescript/ai-fal') +const ENDPOINTS_DTS = resolve( + FAL_PKG, + 'node_modules/@fal-ai/client/src/types/endpoints.d.ts', +) +const CLIENT_PKG_JSON = resolve( + FAL_PKG, + 'node_modules/@fal-ai/client/package.json', +) +const OUT_FILE = resolve( + FAL_PKG, + 'src/image/generated/image-field-overrides.ts', +) + +// --------------------------------------------------------------------------- +// Role classification +// --------------------------------------------------------------------------- + +/** + * Routing roles used by the runtime mapper. `single` / `multi` cover unroled + * source images; the rest correspond to `MediaInputRole` values. 
+ */ +type RoleKey = + | 'single' + | 'multi' + | 'mask' + | 'control' + | 'reference' + | 'start' + | 'end' + +/** + * Default field per role — must stay in sync with `DEFAULT_FIELDS` in + * image-inputs.ts. An override is only emitted when the chosen candidate + * differs from this default. + */ +const DEFAULTS: Record = { + single: 'image_url', + multi: 'image_urls', + mask: 'mask_url', + control: 'control_image_url', + reference: 'reference_image_urls', + start: 'start_image_url', + end: 'end_image_url', +} + +/** + * Candidate fields per role, in priority order. The first candidate present + * on the endpoint's input type wins. Names here are deliberately + * conservative: only fields whose semantics unambiguously match the role. + * + * `start` / `end` are only consumed by the video mapper (the image mapper + * treats those roles as plain sources), so they are only computed for + * endpoints whose output contains video — that's also why `image_url` is a + * valid `start` candidate: on image-to-video endpoints the source image IS + * the start frame. + */ +const CANDIDATES: Record> = { + single: [ + 'image_url', + 'input_image_url', + 'image_data_url', + 'image_urls', + 'input_image_urls', + ], + multi: [ + 'image_urls', + 'input_image_urls', + 'ref_image_urls', + 'reference_image_urls', + ], + mask: ['mask_url', 'mask_image_url'], + control: ['control_image_url'], + reference: [ + 'reference_image_urls', + 'ref_image_urls', + 'reference_image_url', + 'image_urls', + 'input_image_urls', + ], + start: ['start_image_url', 'first_frame_url', 'first_image_url', 'image_url'], + end: ['end_image_url', 'last_frame_url', 'last_image_url', 'tail_image_url'], +} + +/** Roles only meaningful for endpoints that produce video. */ +const VIDEO_ONLY_ROLES = new Set(['start', 'end']) + +/** + * Fields that take an array of images. 
The runtime mapper wraps/refuses + * values based on this same set (`LIST_FIELDS` in image-inputs.ts); the + * generator asserts the actual types agree so the two never drift. + */ +const LIST_FIELDS = new Set([ + 'image_urls', + 'input_image_urls', + 'ref_image_urls', + 'reference_image_urls', +]) + +// --------------------------------------------------------------------------- +// Type extraction +// --------------------------------------------------------------------------- + +interface EndpointFields { + /** All input field names for this endpoint */ + fields: Set + /** Field name -> whether the field accepts an array */ + isList: Map + /** Whether the endpoint's output contains video */ + producesVideo: boolean +} + +function extractEndpointInputs(): Map { + const program = ts.createProgram([ENDPOINTS_DTS], { + target: ts.ScriptTarget.ES2022, + skipLibCheck: true, + }) + const checker = program.getTypeChecker() + const source = program.getSourceFile(ENDPOINTS_DTS) + if (!source) throw new Error(`Could not load ${ENDPOINTS_DTS}`) + + let mapType: ts.Type | undefined + source.forEachChild((node) => { + if ( + ts.isTypeAliasDeclaration(node) && + node.name.text === 'EndpointTypeMap' + ) { + mapType = checker.getTypeAtLocation(node.name) + } + }) + if (!mapType) throw new Error('EndpointTypeMap not found in endpoints.d.ts') + + const endpoints = new Map() + for (const endpoint of mapType.getProperties()) { + const endpointType = checker.getTypeOfSymbol(endpoint) + const inputSymbol = endpointType.getProperty('input') + if (!inputSymbol) continue + const inputType = checker.getTypeOfSymbol(inputSymbol) + + const fields = new Set() + const isList = new Map() + for (const field of inputType.getProperties()) { + const name = field.getName() + fields.add(name) + const fieldType = checker.getTypeOfSymbol(field) + isList.set(name, typeAcceptsArray(checker, fieldType)) + } + + const outputSymbol = endpointType.getProperty('output') + const producesVideo = outputSymbol + ? 
checker + .getTypeOfSymbol(outputSymbol) + .getProperties() + .some((p) => p.getName() === 'video' || p.getName() === 'videos') + : false + + endpoints.set(endpoint.getName(), { fields, isList, producesVideo }) + } + return endpoints +} + +function typeAcceptsArray(checker: ts.TypeChecker, type: ts.Type): boolean { + const parts = type.isUnion() ? type.types : [type] + return parts.some((part) => checker.isArrayLikeType(part)) +} + +// --------------------------------------------------------------------------- +// Override computation +// --------------------------------------------------------------------------- + +const ROLE_ORDER: Array = [ + 'single', + 'multi', + 'mask', + 'control', + 'reference', + 'start', + 'end', +] + +function computeOverrides( + endpoints: Map, +): Map>> { + const overrides = new Map>>() + + for (const [endpointId, { fields, isList, producesVideo }] of endpoints) { + const entry: Partial> = {} + for (const role of ROLE_ORDER) { + if (VIDEO_ONLY_ROLES.has(role) && !producesVideo) continue + const chosen = CANDIDATES[role].find((candidate) => fields.has(candidate)) + if (!chosen || chosen === DEFAULTS[role]) continue + + // Arity sanity check: the runtime mapper decides array-wrapping from + // the static LIST_FIELDS set, so the actual type must agree. + const actualIsList = isList.get(chosen) ?? false + const assumedIsList = LIST_FIELDS.has(chosen) + if (actualIsList !== assumedIsList) { + throw new Error( + `Arity mismatch for ${endpointId}.${chosen}: type says ` + + `${actualIsList ? 'array' : 'scalar'} but LIST_FIELDS assumes ` + + `${assumedIsList ? 'array' : 'scalar'}. 
Update LIST_FIELDS here ` + + `and LIST_FIELDS in image-inputs.ts.`, + ) + } + entry[role] = chosen + } + if (Object.keys(entry).length > 0) overrides.set(endpointId, entry) + } + return overrides +} + +// --------------------------------------------------------------------------- +// Emission +// --------------------------------------------------------------------------- + +function render( + overrides: Map>>, +): string { + const clientVersion = ( + JSON.parse(readFileSync(CLIENT_PKG_JSON, 'utf8')) as { version: string } + ).version + const dtsHash = createHash('sha256') + .update(readFileSync(ENDPOINTS_DTS)) + .digest('hex') + + const sortedIds = [...overrides.keys()].sort() + const entries = sortedIds + .map((id) => { + const entry = overrides.get(id)! + const pairs = ROLE_ORDER.filter((role) => entry[role]).map( + (role) => `${role}: '${entry[role]}'`, + ) + return ` '${id}': { ${pairs.join(', ')} },` + }) + .join('\n') + + // Union of every field name the runtime mapper may emit: the per-role + // defaults plus every field referenced by an override. + const fieldNames = new Set(Object.values(DEFAULTS)) + for (const entry of overrides.values()) { + for (const field of Object.values(entry)) fieldNames.add(field) + } + const fieldNameUnion = [...fieldNames] + .sort() + .map((name) => ` | '${name}'`) + .join('\n') + + return `/* eslint-disable */ +// --------------------------------------------------------------------------- +// AUTO-GENERATED — do not edit by hand. +// +// Generated from @fal-ai/client@${clientVersion} EndpointTypeMap by +// scripts/generate-fal-image-field-map.ts. Regenerate after bumping +// @fal-ai/client: +// +// pnpm tsx scripts/generate-fal-image-field-map.ts +// +// Maps fal endpoint ids to the image-conditioning input fields they accept +// whenever those differ from the defaults in image-inputs.ts. Endpoints +// matching the defaults are omitted. 
The \`satisfies\` clause below checks +// every field name against the SDK's endpoint input types at compile time +// (type-only import — nothing from endpoints.d.ts is shipped at runtime). +// --------------------------------------------------------------------------- +import type { EndpointTypeMap } from '@fal-ai/client/endpoints' + +/** sha256 of the endpoints.d.ts this file was generated from. */ +export const FAL_ENDPOINTS_DTS_SHA256 = + '${dtsHash}' + +/** Every input field name the image-input mappers may emit. */ +export type FalImageFieldName = +${fieldNameUnion} + +/** + * Per-role input-field overrides. Roles: \`single\` / \`multi\` route unroled + * source images; the rest mirror \`MediaInputRole\` (\`start\` / \`end\` map the + * \`start_frame\` / \`end_frame\` roles). + */ +export interface FalImageFieldOverride { + single?: string + multi?: string + mask?: string + control?: string + reference?: string + start?: string + end?: string +} + +type InputFieldOf = Extract< + keyof EndpointTypeMap[K]['input'], + string +> + +export const FAL_IMAGE_FIELD_OVERRIDES = { +${entries} +} as const satisfies { + [K in keyof EndpointTypeMap]?: { + [Role in keyof FalImageFieldOverride]?: InputFieldOf + } +} +` +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +const endpoints = extractEndpointInputs() +const overrides = computeOverrides(endpoints) +const output = render(overrides) + +if (process.argv.includes('--check')) { + const current = readFileSync(OUT_FILE, 'utf8') + if (current !== output) { + console.error( + 'image-field-overrides.ts is stale. 
Run: pnpm tsx scripts/generate-fal-image-field-map.ts', + ) + process.exit(1) + } + console.log('image-field-overrides.ts is up to date.') +} else { + writeFileSync(OUT_FILE, output) + console.log( + `Wrote ${overrides.size} endpoint overrides (of ${endpoints.size} endpoints) to ${OUT_FILE}`, + ) +} From 389a1a0ff603ad01f5beb68bed1d3a9a4d55d9a5 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:01:56 +1000 Subject: [PATCH 04/11] feat(ai-grok,ai-openrouter): support imageInputs for image-conditioned generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grok: add the xAI Imagine API image models (grok-imagine-image, grok-imagine-image-quality) to model-meta. With imageInputs they route to xAI's JSON POST /v1/images/edits endpoint via direct fetch (the OpenAI SDK's images.edit() sends multipart/form-data, which xAI rejects) — a single input as image:{url}, 2-3 inputs as images:[...] referenceable in the prompt as /; >3 inputs and mask/control roles throw. Their generic `size` uses an aspectRatio_resolution template ('16:9_2k', suffix optional), mirroring Gemini's native image models, and maps to the Imagine aspect_ratio/resolution parameters on both the generate and edit paths. grok-2-image-1212 stays text-to-image only with a clear error. OpenRouter: imageInputs are injected as multimodal image_url content parts alongside the prompt in the chat-completions message and forwarded to the underlying image model. Neither path fetches or base64-encodes URL sources in-process — URLs pass through verbatim and are fetched by the provider; data sources become data URIs. Bumps ai-grok and ai-openrouter to minor in the existing changeset. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/image-and-video-inputs.md | 7 +- docs/adapters/grok.md | 39 ++++ docs/media/image-generation.md | 4 +- packages/ai-grok/src/adapters/image.ts | 190 ++++++++++++++++-- .../src/image/image-provider-options.ts | 123 ++++++++++++ packages/ai-grok/src/model-meta.ts | 39 +++- packages/ai-grok/tests/grok-adapter.test.ts | 181 +++++++++++++++++ packages/ai-openrouter/src/adapters/image.ts | 41 +++- .../ai-openrouter/tests/image-adapter.test.ts | 82 ++++++++ .../skills/ai-core/media-generation/SKILL.md | 26 +-- testing/e2e/src/lib/feature-support.ts | 3 +- 11 files changed, 691 insertions(+), 44 deletions(-) diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md index b91528002..071dcb2e7 100644 --- a/.changeset/image-and-video-inputs.md +++ b/.changeset/image-and-video-inputs.md @@ -3,8 +3,8 @@ '@tanstack/ai-openai': minor '@tanstack/ai-gemini': minor '@tanstack/ai-fal': minor -'@tanstack/ai-grok': patch -'@tanstack/ai-openrouter': patch +'@tanstack/ai-grok': minor +'@tanstack/ai-openrouter': minor '@tanstack/ai-event-client': patch --- @@ -16,7 +16,8 @@ Provider behavior in this release: - **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws. - **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only). - **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). 
Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). -- **Grok**, **OpenRouter** — Throw with a link to issue #618 (full support pending dedicated Imagine / multimodal injection work). +- **Grok** — New `grok-imagine-image` / `grok-imagine-image-quality` models. With `imageInputs`, they route to xAI's JSON `/v1/images/edits` endpoint (up to 3 source images, referenceable as ``, `` in the prompt; `role: 'mask'` / `'control'` throw). Their `size` uses an `aspectRatio_resolution` template (`'16:9_2k'`, suffix optional) mirroring Gemini's native image models. `grok-2-image-1212` remains text-to-image only and throws on `imageInputs`. +- **OpenRouter** — `imageInputs` are injected as multimodal `image_url` content parts alongside the prompt and forwarded to the underlying image model. URL sources pass through verbatim (no fetching or re-encoding in your process); `data` sources become data URIs. - **Anthropic** — Unchanged (no image generation API). Closes #618. 
diff --git a/docs/adapters/grok.md b/docs/adapters/grok.md index 528226903..7c0ef33d2 100644 --- a/docs/adapters/grok.md +++ b/docs/adapters/grok.md @@ -160,6 +160,45 @@ const result = await generateImage({ console.log(result.images); ``` +The grok-imagine models (`grok-imagine-image`, `grok-imagine-image-quality`) +are aspect-ratio sized — `size` takes an `aspectRatio_resolution` template +like `"16:9_2k"` (the `_2k` suffix is optional): + +```typescript +const result = await generateImage({ + adapter: grokImage("grok-imagine-image"), + prompt: "A futuristic cityscape at sunset", + size: "16:9_2k", +}); +``` + +### Image Editing (image-to-image) + +The grok-imagine models accept `imageInputs` for image-conditioned +generation via xAI's `/v1/images/edits` endpoint — up to 3 source images, +referenceable in the prompt as ``, ``: + +```typescript +const result = await generateImage({ + adapter: grokImage("grok-imagine-image"), + prompt: "Render in the style of ", + imageInputs: [ + { + type: "image", + source: { type: "url", value: "https://example.com/product.png" }, + }, + { + type: "image", + source: { type: "url", value: "https://example.com/style.png" }, + }, + ], +}); +``` + +URL sources are fetched by xAI's servers, so they must be publicly +reachable; use a `data` source for private images. `grok-2-image-1212` is +text-to-image only and throws when `imageInputs` is passed. + ## Text-to-Speech Generate speech with Grok TTS: diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md index c2f5d34f2..85174a820 100644 --- a/docs/media/image-generation.md +++ b/docs/media/image-generation.md @@ -234,8 +234,8 @@ await generateImage({ | **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.<br>`dall-e-2` → `images.edit()` with 1 source image only.<br>`dall-e-3` → throws (no edit support). | | **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.<br>Imagen models → throws (text-to-image only). | | **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (e.g. nano-banana edit gets `image_urls`, Fooocus masks get `mask_image_url`). Defaults for unknown endpoints: 1 input → `image_url`; multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | -| **Grok** | Throws — the current adapter wraps Grok's OpenAI-compat endpoint, which doesn't expose image inputs. xAI's native Imagine API support is tracked as a follow-up. | -| **OpenRouter** | Throws — multimodal injection into the chat-completions pathway is tracked as a follow-up. | +| **Grok** | grok-imagine models → xAI's `/v1/images/edits` (up to 3 source images, referenceable as ``, `` in the prompt). `role: 'mask'` / `'control'` throw (no Imagine API equivalent). `grok-2-image-1212` throws (text-to-image only). | +| **OpenRouter** | Inputs are injected as multimodal `image_url` content parts alongside the prompt and forwarded to the underlying image model. | | **Anthropic** | n/a — no image generation API.
| Adapters that don't support image-conditioned generation throw a clear diff --git a/packages/ai-grok/src/adapters/image.ts b/packages/ai-grok/src/adapters/image.ts index e5241103b..70d8e78b7 100644 --- a/packages/ai-grok/src/adapters/image.ts +++ b/packages/ai-grok/src/adapters/image.ts @@ -5,6 +5,8 @@ import { buildImagesUsage } from '@tanstack/openai-base' import { generateId } from '@tanstack/ai-utils' import { getGrokApiKeyFromEnv, withGrokDefaults } from '../utils/client' import { + isGrokImagineImageModel, + parseGrokImagineSize, validateImageSize, validateNumberOfImages, validatePrompt, @@ -13,6 +15,8 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ImagePart, + MediaInputMetadata, } from '@tanstack/ai' import type OpenAI_SDK from 'openai' import type { GrokImageModel } from '../model-meta' @@ -28,15 +32,58 @@ import type { GrokClientConfig } from '../utils' */ export interface GrokImageConfig extends GrokClientConfig {} +/** Maximum source images accepted by xAI's image edit endpoint. */ +const MAX_EDIT_IMAGES = 3 + +/** + * Maps the generic `size` option onto Imagine API parameters: the + * "aspectRatio_resolution" template ("16:9_2k") splits into `aspect_ratio` + * and optional `resolution` request fields. + */ +function imagineSizeParams(size: string | undefined): { + aspect_ratio?: string + resolution?: string +} { + if (!size) return {} + const parsed = parseGrokImagineSize(size) + if (!parsed) return {} + return { + aspect_ratio: parsed.aspectRatio, + ...(parsed.resolution !== undefined && { resolution: parsed.resolution }), + } +} + +/** + * Convert a TanStack ImagePart to the URL string accepted by xAI's edit + * endpoint: public URLs pass through (fetched by xAI's servers), data + * sources become base64 data URIs. 
+ */ +function imagePartToUrl(part: ImagePart): string { + if (part.source.type === 'url') return part.source.value + return `data:${part.source.mimeType};base64,${part.source.value}` +} + +/** Response shape of xAI's `/v1/images/edits` endpoint. */ +interface GrokImageEditResponse { + data?: Array<{ + url?: string | null + b64_json?: string | null + mime_type?: string + }> +} + /** * Grok Image Generation Adapter * * Tree-shakeable adapter for Grok image generation functionality. - * Supports grok-2-image-1212 model. + * Supports the legacy grok-2-image-1212 model (text-to-image via the + * OpenAI-compat endpoint) and the grok-imagine image models, which also + * accept `imageInputs` for image-conditioned generation via xAI's + * `/v1/images/edits` endpoint (up to 3 source images). * * Features: * - Model-specific type-safe provider options - * - Size validation per model + * - Size / aspect-ratio validation per model * - Number of images validation */ export class GrokImageAdapter< @@ -51,10 +98,12 @@ export class GrokImageAdapter< readonly name = 'grok' as const protected client: OpenAI + private readonly clientConfig: GrokImageConfig constructor(config: GrokImageConfig, model: TModel) { super(model, {}) - this.client = new OpenAI(withGrokDefaults(config)) + this.clientConfig = withGrokDefaults(config) + this.client = new OpenAI(this.clientConfig) } async generateImages( @@ -62,31 +111,44 @@ export class GrokImageAdapter< ): Promise { const { model, prompt, numberOfImages, size, modelOptions } = options - if ( - options.imageInputs?.length || - options.videoInputs?.length || - options.audioInputs?.length - ) { + if (options.videoInputs?.length || options.audioInputs?.length) { throw new Error( - `grok.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` + - `Image-conditioned generation requires the xAI Imagine API, which the current adapter ` + - `does not target (it uses the OpenAI-compat endpoint). 
Track progress at https://github.com/TanStack/ai/issues/618.`, + `grok.generateImages does not support videoInputs / audioInputs on model ${model}.`, ) } + if (options.imageInputs?.length) { + if (!isGrokImagineImageModel(model)) { + throw new Error( + `grok: model "${model}" does not support imageInputs. ` + + `Image-conditioned generation requires an Imagine API model ` + + `('grok-imagine-image' or 'grok-imagine-image-quality').`, + ) + } + return await this.editImages(options) + } + validatePrompt({ prompt, model }) validateImageSize(model, size) validateNumberOfImages(model, numberOfImages) - const resolvedSize = size as OpenAI_SDK.Images.ImageGenerateParams['size'] - const request: OpenAI_SDK.Images.ImageGenerateParamsNonStreaming = { + // grok-imagine models are aspect-ratio sized: the generic `size` option + // carries an "aspectRatio_resolution" template (e.g. '16:9_2k', like + // Gemini native image models) and maps to the Imagine API's + // `aspect_ratio` / `resolution` parameters instead of OpenAI-style `size`. + const isImagine = isGrokImagineImageModel(model) + const request = { model, prompt, n: numberOfImages ?? 1, - ...(resolvedSize !== undefined && { size: resolvedSize }), + ...(isImagine + ? imagineSizeParams(size) + : size !== undefined && { + size: size as OpenAI_SDK.Images.ImageGenerateParams['size'], + }), stream: false, ...modelOptions, - } + } as OpenAI_SDK.Images.ImageGenerateParamsNonStreaming try { options.logger.request( @@ -134,6 +196,104 @@ export class GrokImageAdapter< throw error } } + + /** + * Image-conditioned generation via xAI's Imagine API. + * + * The `/v1/images/edits` endpoint takes `application/json` (the OpenAI + * SDK's `images.edit()` sends `multipart/form-data`, which xAI rejects), + * so this path issues the request directly. One input is sent as + * `image: { url }`; multiple inputs (up to 3) as `images: [{ url }, ...]`, + * referenceable in the prompt as ``, ``, ... 
+ */ + private async editImages( + options: ImageGenerationOptions, + ): Promise { + const { model, prompt, numberOfImages, size, modelOptions, logger } = + options + const imageInputs = options.imageInputs ?? [] + + const unsupportedRole = imageInputs.find( + (part) => + part.metadata?.role === 'mask' || part.metadata?.role === 'control', + ) + if (unsupportedRole) { + throw new Error( + `grok: the Imagine API has no ${unsupportedRole.metadata?.role} input; ` + + `only source/reference images are supported.`, + ) + } + if (imageInputs.length > MAX_EDIT_IMAGES) { + throw new Error( + `grok: model "${model}" accepts at most ${MAX_EDIT_IMAGES} source images; received ${imageInputs.length}.`, + ) + } + + validatePrompt({ prompt, model }) + validateImageSize(model, size) + validateNumberOfImages(model, numberOfImages) + + const urls = imageInputs.map((part) => imagePartToUrl(part)) + const request: Record = { + model, + prompt, + ...(urls.length === 1 + ? { image: { url: urls[0] } } + : { images: urls.map((url) => ({ url })) }), + ...(numberOfImages !== undefined && { n: numberOfImages }), + ...imagineSizeParams(size), + ...modelOptions, + } + + try { + logger.request( + `activity=image provider=${this.name} model=${model} edit images=${urls.length}`, + { provider: this.name, model }, + ) + + const response = await fetch( + `${this.clientConfig.baseURL}/images/edits`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${this.clientConfig.apiKey}`, + }, + body: JSON.stringify(request), + }, + ) + if (!response.ok) { + const body = await response.text() + throw new Error( + `grok: image edit request failed (${response.status} ${response.statusText}): ${body}`, + ) + } + + const result = (await response.json()) as GrokImageEditResponse + const images: Array = (result.data ?? 
[]).flatMap( + (item): Array => { + if (item.b64_json) return [{ b64Json: item.b64_json }] + if (item.url) return [{ url: item.url }] + return [] + }, + ) + if (images.length === 0) { + throw new Error('grok: image edit response contained no images') + } + + return { + id: generateId(this.name), + model, + images, + } + } catch (error: unknown) { + logger.errors(`${this.name}.generateImages fatal`, { + error: toRunErrorPayload(error, `${this.name}.generateImages failed`), + source: `${this.name}.generateImages`, + }) + throw error + } + } } /** diff --git a/packages/ai-grok/src/image/image-provider-options.ts b/packages/ai-grok/src/image/image-provider-options.ts index 9b0d9ee59..da2e52769 100644 --- a/packages/ai-grok/src/image/image-provider-options.ts +++ b/packages/ai-grok/src/image/image-provider-options.ts @@ -10,6 +10,84 @@ */ export type GrokImageSize = '1024x1024' | '1536x1024' | '1024x1536' +/** + * Aspect ratios accepted by the grok-imagine image models. + */ +export type GrokImagineAspectRatio = + | '1:1' + | '3:4' + | '4:3' + | '9:16' + | '16:9' + | '2:3' + | '3:2' + | '9:19.5' + | '19.5:9' + | '9:20' + | '20:9' + | '1:2' + | '2:1' + | 'auto' + +/** + * Resolution tiers for the grok-imagine image models. + */ +export type GrokImagineResolution = '1k' | '2k' + +/** + * Size strings for grok-imagine image models. The Imagine API is + * aspect-ratio based rather than pixel-size based; like Gemini's native + * image models, the generic `size` option uses an + * `aspectRatio_resolution` template ("16:9_2k") — the resolution suffix is + * optional ("16:9" uses the API default of 1k). 
+ */ +export type GrokImagineImageSize = + | GrokImagineAspectRatio + | `${GrokImagineAspectRatio}_${GrokImagineResolution}` + +const GROK_IMAGINE_ASPECT_RATIOS: ReadonlyArray = [ + '1:1', + '3:4', + '4:3', + '9:16', + '16:9', + '2:3', + '3:2', + '9:19.5', + '19.5:9', + '9:20', + '20:9', + '1:2', + '2:1', + 'auto', +] + +const GROK_IMAGINE_RESOLUTIONS: ReadonlyArray = ['1k', '2k'] + +/** + * Models served by xAI's Imagine API. They are aspect-ratio sized and + * support image-conditioned generation via `/v1/images/edits`; the legacy + * grok-2-image-1212 model is pixel-sized and text-to-image only. + */ +export function isGrokImagineImageModel(model: string): boolean { + return model.startsWith('grok-imagine-image') +} + +/** + * Parses a grok-imagine size string into its components. + * Format: "aspectRatio" or "aspectRatio_resolution", + * e.g. "16:9_2k" → { aspectRatio: "16:9", resolution: "2k" }. + * Returns undefined when the string doesn't match the template. + */ +export function parseGrokImagineSize( + size: string, +): { aspectRatio: string; resolution?: string } | undefined { + const match = size.match(/^([\d.]+:[\d.]+|auto)(?:_(.+))?$/) + const [, aspectRatio, resolution] = match ?? [] + if (aspectRatio === undefined) return undefined + return { aspectRatio, ...(resolution !== undefined && { resolution }) } +} + /** * Base provider options for Grok image models */ @@ -39,11 +117,37 @@ export interface GrokImageProviderOptions extends GrokImageBaseProviderOptions { response_format?: 'url' | 'b64_json' } +/** + * Provider options for the grok-imagine image models (generation and + * image-conditioned editing via xAI's Imagine API). + */ +export interface GrokImagineImageProviderOptions extends GrokImageBaseProviderOptions { + /** + * The format in which generated images are returned. + * @default 'url' + */ + response_format?: 'url' | 'b64_json' + + /** + * Output resolution. 
+ * @default '1k' + */ + resolution?: '1k' | '2k' + + /** + * Processing tier for the request. + * @default 'default' + */ + service_tier?: 'default' | 'priority' +} + /** * Type-only map from model name to its specific provider options. */ export type GrokImageModelProviderOptionsByName = { 'grok-2-image-1212': GrokImageProviderOptions + 'grok-imagine-image': GrokImagineImageProviderOptions + 'grok-imagine-image-quality': GrokImagineImageProviderOptions } /** @@ -51,6 +155,8 @@ export type GrokImageModelProviderOptionsByName = { */ export type GrokImageModelSizeByName = { 'grok-2-image-1212': GrokImageSize + 'grok-imagine-image': GrokImagineImageSize + 'grok-imagine-image-quality': GrokImagineImageSize } /** @@ -71,6 +177,23 @@ export function validateImageSize( ): void { if (!size) return + if (isGrokImagineImageModel(model)) { + const parsed = parseGrokImagineSize(size) + if ( + !parsed || + !GROK_IMAGINE_ASPECT_RATIOS.includes(parsed.aspectRatio) || + (parsed.resolution !== undefined && + !GROK_IMAGINE_RESOLUTIONS.includes(parsed.resolution)) + ) { + throw new Error( + `Size "${size}" is not supported by model "${model}". ` + + `Expected an aspect ratio (${GROK_IMAGINE_ASPECT_RATIOS.join(', ')}) ` + + `optionally suffixed with a resolution ("16:9_2k"; resolutions: ${GROK_IMAGINE_RESOLUTIONS.join(', ')}).`, + ) + } + return + } + const validSizes: Record> = { 'grok-2-image-1212': ['1024x1024', '1536x1024', '1024x1536'], } diff --git a/packages/ai-grok/src/model-meta.ts b/packages/ai-grok/src/model-meta.ts index 5b76aaa10..9047cbfeb 100644 --- a/packages/ai-grok/src/model-meta.ts +++ b/packages/ai-grok/src/model-meta.ts @@ -219,6 +219,39 @@ const GROK_2_IMAGE = { }, } as const satisfies ModelMeta +// Imagine API image models. Pricing is per generated image (output only). 
+const GROK_IMAGINE_IMAGE = { + name: 'grok-imagine-image', + supports: { + input: ['text', 'image'], + output: ['image'], + }, + pricing: { + input: { + normal: 0, + }, + output: { + normal: 0.02, + }, + }, +} as const satisfies ModelMeta + +const GROK_IMAGINE_IMAGE_QUALITY = { + name: 'grok-imagine-image-quality', + supports: { + input: ['text', 'image'], + output: ['image'], + }, + pricing: { + input: { + normal: 0, + }, + output: { + normal: 0.05, + }, + }, +} as const satisfies ModelMeta + /** * Grok Chat Models * Based on xAI's available models as of 2025 @@ -349,7 +382,11 @@ export const GROK_COMBINED_TOOLS_AND_SCHEMA_MODELS = new Set([ /** * Grok Image Generation Models */ -export const GROK_IMAGE_MODELS = [GROK_2_IMAGE.name] as const +export const GROK_IMAGE_MODELS = [ + GROK_2_IMAGE.name, + GROK_IMAGINE_IMAGE.name, + GROK_IMAGINE_IMAGE_QUALITY.name, +] as const // xAI's `/v1/tts` endpoint is endpoint-addressed and does not take a `model` // parameter. This synthetic identifier satisfies the SDK's `TTSOptions.model` diff --git a/packages/ai-grok/tests/grok-adapter.test.ts b/packages/ai-grok/tests/grok-adapter.test.ts index cf148c3e1..584427606 100644 --- a/packages/ai-grok/tests/grok-adapter.test.ts +++ b/packages/ai-grok/tests/grok-adapter.test.ts @@ -189,6 +189,187 @@ describe('Grok adapters', () => { 'XAI_API_KEY is required', ) }) + + it('maps the size template to aspect_ratio/resolution for imagine models', async () => { + const adapter = createGrokImage('grok-imagine-image', 'test-api-key') + const mockGenerate = vi.fn().mockResolvedValue({ + data: [{ url: 'https://example.com/out.png' }], + }) + ;(adapter as any).client = { images: { generate: mockGenerate } } + + await adapter.generateImages({ + model: 'grok-imagine-image', + prompt: 'A skyline', + size: '16:9_2k', + logger: testLogger, + }) + + expect(mockGenerate).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'grok-imagine-image', + aspect_ratio: '16:9', + resolution: '2k', + }), + ) 
+ expect(mockGenerate.mock.calls[0]![0]).not.toHaveProperty('size') + }) + }) + + describe('Image adapter — imageInputs (Imagine edits endpoint)', () => { + const editResponse = (body: Record, ok = true) => + vi.fn().mockResolvedValue({ + ok, + status: ok ? 200 : 422, + statusText: ok ? 'OK' : 'Unprocessable Entity', + json: () => Promise.resolve(body), + text: () => Promise.resolve(JSON.stringify(body)), + }) + + beforeEach(() => { + vi.unstubAllGlobals() + }) + + it('routes a single imageInput to POST /v1/images/edits', async () => { + const mockFetch = editResponse({ + data: [{ url: 'https://example.com/edited.png' }], + }) + vi.stubGlobal('fetch', mockFetch) + + const adapter = createGrokImage('grok-imagine-image', 'test-api-key') + const result = await adapter.generateImages({ + model: 'grok-imagine-image', + prompt: 'Make it a pencil sketch', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/source.png' }, + }, + ], + logger: testLogger, + }) + + expect(mockFetch).toHaveBeenCalledTimes(1) + const [url, init] = mockFetch.mock.calls[0]! 
+ expect(url).toBe('https://api.x.ai/v1/images/edits') + expect(init.headers.Authorization).toBe('Bearer test-api-key') + expect(JSON.parse(init.body)).toMatchObject({ + model: 'grok-imagine-image', + prompt: 'Make it a pencil sketch', + image: { url: 'https://example.com/source.png' }, + }) + expect(result.images).toEqual([{ url: 'https://example.com/edited.png' }]) + }) + + it('sends multiple inputs as images[] and maps size to aspect_ratio', async () => { + const mockFetch = editResponse({ data: [{ b64_json: 'aGVsbG8=' }] }) + vi.stubGlobal('fetch', mockFetch) + + const adapter = createGrokImage( + 'grok-imagine-image-quality', + 'test-api-key', + ) + const result = await adapter.generateImages({ + model: 'grok-imagine-image-quality', + prompt: 'Put in the style of ', + size: '1:1', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, + }, + { + type: 'image', + source: { type: 'data', value: 'c3R5bGU=', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }) + + const body = JSON.parse(mockFetch.mock.calls[0]![1].body) + expect(body.images).toEqual([ + { url: 'https://example.com/product.png' }, + { url: 'data:image/png;base64,c3R5bGU=' }, + ]) + expect(body.image).toBeUndefined() + expect(body.aspect_ratio).toBe('1:1') + expect(result.images).toEqual([{ b64Json: 'aGVsbG8=' }]) + }) + + it('throws for imageInputs on the legacy grok-2 image model', async () => { + const adapter = createGrokImage('grok-2-image-1212', 'test-api-key') + + await expect( + adapter.generateImages({ + model: 'grok-2-image-1212', + prompt: 'Edit this', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/a.png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/does not support imageInputs/) + }) + + it('throws for more than 3 source images', async () => { + const adapter = createGrokImage('grok-imagine-image', 'test-api-key') + const part = { + type: 'image' as const, + source: 
{ type: 'url' as const, value: 'https://example.com/a.png' }, + } + + await expect( + adapter.generateImages({ + model: 'grok-imagine-image', + prompt: 'Combine these', + imageInputs: [part, part, part, part], + logger: testLogger, + }), + ).rejects.toThrow(/at most 3 source images/) + }) + + it('throws for mask/control roles (no Imagine API equivalent)', async () => { + const adapter = createGrokImage('grok-imagine-image', 'test-api-key') + + await expect( + adapter.generateImages({ + model: 'grok-imagine-image', + prompt: 'Inpaint', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/m.png' }, + metadata: { role: 'mask' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/no mask input/) + }) + + it('throws with response detail on a failed edit request', async () => { + vi.stubGlobal( + 'fetch', + editResponse({ error: 'bad image' }, /* ok */ false), + ) + + const adapter = createGrokImage('grok-imagine-image', 'test-api-key') + await expect( + adapter.generateImages({ + model: 'grok-imagine-image', + prompt: 'Edit', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/a.png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/image edit request failed \(422/) + }) }) describe('Summarize adapter', () => { diff --git a/packages/ai-openrouter/src/adapters/image.ts b/packages/ai-openrouter/src/adapters/image.ts index 786b582bd..36ae14bb2 100644 --- a/packages/ai-openrouter/src/adapters/image.ts +++ b/packages/ai-openrouter/src/adapters/image.ts @@ -15,6 +15,8 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ImagePart, + MediaInputMetadata, } from '@tanstack/ai' import type { OPENROUTER_IMAGE_MODELS } from '../model-meta' import type { ChatResult } from '@openrouter/sdk/models' @@ -40,6 +42,16 @@ const SIZE_TO_ASPECT_RATIO: Record = { '1536x672': '21:9', } +/** + * Convert a TanStack ImagePart into the URL string accepted by OpenRouter's + 
* `image_url` content parts: public URLs pass through, data sources become + * base64 data URIs. + */ +function imagePartToUrl(part: ImagePart): string { + if (part.source.type === 'url') return part.source.value + return `data:${part.source.mimeType};base64,${part.source.value}` +} + export class OpenRouterImageAdapter< TModel extends OpenRouterImageModel, > extends BaseImageAdapter< @@ -65,15 +77,9 @@ export class OpenRouterImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - if ( - options.imageInputs?.length || - options.videoInputs?.length || - options.audioInputs?.length - ) { + if (options.videoInputs?.length || options.audioInputs?.length) { throw new Error( - `openrouter.generateImages does not yet support imageInputs / videoInputs / audioInputs. ` + - `Image-conditioned generation via OpenRouter requires injecting parts into the multimodal ` + - `chat-completions messages array; this is tracked at https://github.com/TanStack/ai/issues/618.`, + `openrouter.generateImages does not support videoInputs / audioInputs on model ${this.model}.`, ) } @@ -82,6 +88,23 @@ export class OpenRouterImageAdapter< // Use provided aspect_ratio or derive from size const aspectRatio = size ? SIZE_TO_ASPECT_RATIO[size] : undefined + // Image-conditioned generation: inject inputs as multimodal content + // parts alongside the prompt. OpenRouter forwards them to the + // underlying image model (e.g. Gemini image models). Role hints carry + // no per-field semantics on the chat-completions pathway — inputs are + // attached in order, like the Gemini adapter's multimodal `contents`. + const imageInputs = options.imageInputs ?? [] + const content = + imageInputs.length > 0 + ? 
[ + { type: 'text' as const, text: prompt }, + ...imageInputs.map((part) => ({ + type: 'image_url' as const, + imageUrl: { url: imagePartToUrl(part) }, + })), + ] + : prompt + logger.request( `activity=generateImage provider=openrouter model=${this.model}`, { @@ -96,7 +119,7 @@ export class OpenRouterImageAdapter< messages: [ { role: 'user', - content: prompt, + content, }, ], modalities: ['image'], diff --git a/packages/ai-openrouter/tests/image-adapter.test.ts b/packages/ai-openrouter/tests/image-adapter.test.ts index 536e47e7e..d62a64fdd 100644 --- a/packages/ai-openrouter/tests/image-adapter.test.ts +++ b/packages/ai-openrouter/tests/image-adapter.test.ts @@ -242,6 +242,88 @@ describe('OpenRouter Image Adapter', () => { ) }) + it('injects imageInputs as multimodal content parts', async () => { + const mockResponse = createMockImageResponse([ + { url: 'https://example.com/edited.png' }, + ]) + + mockSend = vi.fn().mockResolvedValueOnce(mockResponse) + + const adapter = createAdapter() + + const result = await adapter.generateImages({ + model: 'google/gemini-2.5-flash-image', + prompt: 'Turn this into a cinematic product photo', + imageInputs: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/source.png' }, + }, + { + type: 'image', + source: { type: 'data', value: 'c3R5bGU=', mimeType: 'image/png' }, + metadata: { role: 'reference' }, + }, + ], + logger: testLogger, + }) + + const callArgs = mockSend.mock.calls[0]![0].chatRequest + expect(callArgs.messages).toEqual([ + { + role: 'user', + content: [ + { type: 'text', text: 'Turn this into a cinematic product photo' }, + { + type: 'image_url', + imageUrl: { url: 'https://example.com/source.png' }, + }, + { + type: 'image_url', + imageUrl: { url: 'data:image/png;base64,c3R5bGU=' }, + }, + ], + }, + ]) + expect(result.images).toHaveLength(1) + }) + + it('keeps a plain string prompt when no imageInputs are given', async () => { + const mockResponse = createMockImageResponse([ + { url: 
'https://example.com/image.png' }, + ]) + + mockSend = vi.fn().mockResolvedValueOnce(mockResponse) + + const adapter = createAdapter() + await adapter.generateImages({ + model: 'google/gemini-2.5-flash-image', + prompt: 'A plain prompt', + logger: testLogger, + }) + + const callArgs = mockSend.mock.calls[0]![0].chatRequest + expect(callArgs.messages[0].content).toBe('A plain prompt') + }) + + it('throws for videoInputs / audioInputs', async () => { + const adapter = createAdapter() + + await expect( + adapter.generateImages({ + model: 'google/gemini-2.5-flash-image', + prompt: 'Test', + videoInputs: [ + { + type: 'video', + source: { type: 'url', value: 'https://example.com/v.mp4' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/does not support videoInputs \/ audioInputs/) + }) + it('passes imageConfig correctly', async () => { const mockResponse = createMockImageResponse([ { url: 'https://example.com/image.png' }, diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index af67bf332..536ad7bd8 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -254,25 +254,25 @@ await generateVideo({ **Role hints** (`metadata.role`): -| Role | Maps to | -| --------------- | ------------------------------------------------------------------------ | -| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | -| `'character'` | Same as `'reference'`; Veo `referenceImages` slot | -| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | -| `'control'` | fal `control_image_url` (ControlNet / depth / pose) | +| Role | Maps to | +| --------------- | ------------------------------------------------------------------------------------------- | +| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | +| `'character'` | Same as `'reference'`; Veo 
`referenceImages` slot | +| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | +| `'control'` | fal `control_image_url` (ControlNet / depth / pose) | | `'start_frame'` | fal `start_image_url` (or the endpoint's field, e.g. `image_url` on Kling i2v); Veo `image` | | `'end_frame'` | fal `end_image_url` (or e.g. `tail_image_url` / `last_frame_url`); Veo `lastFrame` | **Provider support matrix:** -| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | -| ---------- | ------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | -| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | -| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | +| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | +| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | +| Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | | fal | Per-endpoint field names from a generated map (`pnpm generate:fal-image-fields`). 
Defaults: 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | Per-endpoint map (e.g. Kling i2v start frame → `image_url`). Defaults: 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | -| Grok | Throws — adapter uses OpenAI-compat endpoint; native Imagine API rewrite pending. | n/a | -| OpenRouter | Throws — multimodal injection pending. | n/a | -| Anthropic | n/a (no image generation API). | n/a | +| Grok | grok-imagine models → `/v1/images/edits` JSON endpoint (≤3 sources, `` prompt refs; mask/control throw). grok-2-image-1212 throws. | n/a | +| OpenRouter | Inputs injected as multimodal `image_url` content parts in the chat-completions message. | n/a | +| Anthropic | n/a (no image generation API). | n/a | `videoInputs` and `audioInputs` follow the same `metadata.role` convention for video-to-video and lipsync flows on fal; other providers throw when diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index bb52c762b..e43acf0d6 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -180,7 +180,8 @@ export const matrix: Record> = { 'image-gen': new Set(['openai', 'grok']), // image-to-image (imageInputs on generateImage) routes adapters to wire // endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, Gemini - // multimodal `generateContent`, fal endpoint-specific input fields). + // multimodal `generateContent`, xAI `/v1/images/edits`, OpenRouter + // multimodal chat content parts, fal endpoint-specific input fields). // Adapter-level mapping is covered by unit tests. Populate this set when // aimock gains support for those endpoints. 
'image-to-image': new Set([]), From 34347f73c75f2dae42ab0ce003dde2a2eda4e1d0 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:10:12 +1000 Subject: [PATCH 05/11] chore: adapt #618 branch to the packages/ restructure and post-rebase API drift - Move the generated fal image-field map and the generator's paths from packages/typescript/ai-fal to packages/ai-fal (repo flattened the layout) - Add gpt-image-2 to EDIT_MAX_IMAGES (new model on main; same 16-image edit limit as the other gpt-image models) - Map edit-path usage through buildImagesUsage to match the new TokenUsage shape, and drop two now-unnecessary type assertions Co-Authored-By: Claude Opus 4.8 (1M context) --- .prettierignore | 2 +- .../image/generated/image-field-overrides.ts | 0 packages/ai-grok/src/adapters/image.ts | 2 +- packages/ai-openai/src/adapters/image.ts | 23 +++++++------------ scripts/generate-fal-image-field-map.ts | 4 ++-- 5 files changed, 12 insertions(+), 19 deletions(-) rename packages/{typescript => }/ai-fal/src/image/generated/image-field-overrides.ts (100%) diff --git a/.prettierignore b/.prettierignore index 1acae11c2..a4770926f 100644 --- a/.prettierignore +++ b/.prettierignore @@ -5,7 +5,7 @@ **/coverage **/dist **/docs -packages/typescript/ai-fal/src/image/generated/ +packages/ai-fal/src/image/generated/ pnpm-lock.yaml .angular diff --git a/packages/typescript/ai-fal/src/image/generated/image-field-overrides.ts b/packages/ai-fal/src/image/generated/image-field-overrides.ts similarity index 100% rename from packages/typescript/ai-fal/src/image/generated/image-field-overrides.ts rename to packages/ai-fal/src/image/generated/image-field-overrides.ts diff --git a/packages/ai-grok/src/adapters/image.ts b/packages/ai-grok/src/adapters/image.ts index 70d8e78b7..5d8d04cf7 100644 --- a/packages/ai-grok/src/adapters/image.ts +++ b/packages/ai-grok/src/adapters/image.ts @@ -144,7 +144,7 @@ export class GrokImageAdapter< 
...(isImagine ? imagineSizeParams(size) : size !== undefined && { - size: size as OpenAI_SDK.Images.ImageGenerateParams['size'], + size: size, }), stream: false, ...modelOptions, diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index a9a01abff..4da3a2b68 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -26,12 +26,13 @@ import type { } from '../image/image-provider-options' import type { OpenAIClientConfig } from '../utils/client' -// Per OpenAI docs: dall-e-2 accepts 1 image to `images.edit()`; gpt-image-1 -// and gpt-image-1-mini accept up to 16; dall-e-3 does not support edit at all. +// Per OpenAI docs: dall-e-2 accepts 1 image to `images.edit()`; the +// gpt-image models accept up to 16; dall-e-3 does not support edit at all. const EDIT_MAX_IMAGES: Record = { 'dall-e-2': 1, 'gpt-image-1': 16, 'gpt-image-1-mini': 16, + 'gpt-image-2': 16, 'dall-e-3': 0, } @@ -245,10 +246,7 @@ export class OpenAIImageAdapter< {}) as Partial), } if (size !== undefined) { - request.size = size as Exclude< - OpenAI_SDK.Images.ImageEditParamsNonStreaming['size'], - undefined - > + request.size = size } if (maskFile) { request.mask = maskFile @@ -281,15 +279,10 @@ export class OpenAIImageAdapter< id: generateId(this.name), model, images, - ...(response.usage - ? { - usage: { - inputTokens: response.usage.input_tokens, - outputTokens: response.usage.output_tokens, - totalTokens: response.usage.total_tokens, - }, - } - : {}), + ...(() => { + const usage = buildImagesUsage(response.usage) + return usage ? 
{ usage } : {} + })(), } } catch (error: unknown) { logger.errors(`${this.name}.editImages fatal`, { diff --git a/scripts/generate-fal-image-field-map.ts b/scripts/generate-fal-image-field-map.ts index a128c9574..297819fe8 100644 --- a/scripts/generate-fal-image-field-map.ts +++ b/scripts/generate-fal-image-field-map.ts @@ -5,7 +5,7 @@ * fal endpoints use inconsistent field names for image-conditioned * generation (`image_url` vs `image_urls` vs `first_frame_url` vs * `mask_image_url`, ...). The runtime mapper in - * `packages/typescript/ai-fal/src/image/image-inputs.ts` applies a default + * `packages/ai-fal/src/image/image-inputs.ts` applies a default * field per input role; this script walks every endpoint's input type with * the TypeScript checker and records, per role, the field the endpoint * actually accepts whenever it differs from that default. Endpoints that @@ -31,7 +31,7 @@ import ts from 'typescript' const __dirname = dirname(fileURLToPath(import.meta.url)) const ROOT = resolve(__dirname, '..') -const FAL_PKG = resolve(ROOT, 'packages/typescript/ai-fal') +const FAL_PKG = resolve(ROOT, 'packages/ai-fal') const ENDPOINTS_DTS = resolve( FAL_PKG, 'node_modules/@fal-ai/client/src/types/endpoints.d.ts', From 26f10a96444e775ec8a1dd8484803e59fbed34f6 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Sun, 7 Jun 2026 10:35:43 +1000 Subject: [PATCH 06/11] feat(ai): make prompt multimodal for generateImage/generateVideo, pass text through verbatim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the imageInputs / videoInputs / audioInputs fields with a multimodal prompt: string | MediaPromptPart[]. Part order is meaningful — natively multimodal providers (Gemini, OpenRouter) receive parts in interleaved order; named-field providers (OpenAI, fal, xAI) extract media parts via the new resolveMediaPrompt() utility and flatten the text. 
Zero magic: prompt text is always sent verbatim. The SDK never injects or rewrites in-prompt referencing markers — users write each provider's own convention (fal Kling/Seedance @Image1, OpenAI/FLUX.2 "image 1" prose, Gemini content descriptions), now documented per provider in the media docs. An earlier grok auto-injection was removed after research showed the convention is absent from xAI's official docs (images are addressed by request order). - Per-model compile-time prompt narrowing via TModelInputModalitiesByName adapter generic (e.g. dall-e-3 / Imagen reject image parts as a type error); fal modality maps are derived at the type level from the SDK's endpoint input types - metadata.tag added as an informational label (never read by adapters) - Gemini now preserves true interleaving in contents; OpenRouter maps parts 1:1 onto chat content parts in order Closes #618 Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/image-and-video-inputs.md | 18 ++- .gitignore | 1 + docs/adapters/grok.md | 16 ++- docs/media/image-generation.md | 116 +++++++++++----- docs/media/video-generation.md | 58 +++++--- packages/ai-fal/src/adapters/image.ts | 28 ++-- packages/ai-fal/src/adapters/video.ts | 29 ++-- packages/ai-fal/src/image/image-inputs.ts | 4 +- packages/ai-fal/src/model-meta.ts | 42 ++++++ packages/ai-gemini/src/adapters/image.ts | 94 ++++++++----- .../src/image/image-provider-options.ts | 12 ++ packages/ai-grok/src/adapters/image.ts | 33 +++-- .../src/image/image-provider-options.ts | 11 ++ packages/ai-grok/tests/grok-adapter.test.ts | 72 +++++++--- packages/ai-openai/src/adapters/image.ts | 35 +++-- packages/ai-openai/src/adapters/video.ts | 27 ++-- .../src/image/image-provider-options.ts | 13 ++ .../src/video/video-provider-options.ts | 11 ++ .../ai-openai/tests/image-adapter.test.ts | 38 +++--- packages/ai-openrouter/src/adapters/image.ts | 51 ++++--- .../src/image/image-provider-options.ts | 10 ++ .../ai-openrouter/tests/image-adapter.test.ts | 18 +-- 
.../skills/ai-core/media-generation/SKILL.md | 90 +++++++----- .../src/activities/generateImage/adapter.ts | 19 ++- .../ai/src/activities/generateImage/index.ts | 65 ++++++--- .../src/activities/generateVideo/adapter.ts | 14 +- .../ai/src/activities/generateVideo/index.ts | 71 ++++------ packages/ai/src/index.ts | 4 + packages/ai/src/types.ts | 128 ++++++++++++------ packages/ai/src/utilities/media-prompt.ts | 86 ++++++++++++ .../tests/image-per-model-type-safety.test.ts | 68 +++++++++- packages/ai/tests/media-prompt.test.ts | 79 +++++++++++ testing/e2e/src/lib/feature-support.ts | 12 +- 33 files changed, 984 insertions(+), 389 deletions(-) create mode 100644 packages/ai/src/utilities/media-prompt.ts create mode 100644 packages/ai/tests/media-prompt.test.ts diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md index 071dcb2e7..da3e301e9 100644 --- a/.changeset/image-and-video-inputs.md +++ b/.changeset/image-and-video-inputs.md @@ -8,16 +8,20 @@ '@tanstack/ai-event-client': patch --- -Add `imageInputs`, `videoInputs`, and `audioInputs` to `generateImage()` and `generateVideo()` for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Each input part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field. +`generateImage()` and `generateVideo()` now accept a multimodal `prompt`: a plain string, or an ordered array of content parts (`TextPart` / `ImagePart` / `VideoPart` / `AudioPart`) for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. 
Part order is meaningful — "not like this _(image)_, more like this _(image)_" — and each media part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field, plus an informational `metadata.tag` label for your own bookkeeping. The accepted part types are narrowed per model at compile time via each adapter's input-modality map, so passing an image part to a text-only model is a type error (with a clear runtime throw as backstop). + +Prompt text is always sent **verbatim** — the SDK never injects or rewrites in-prompt referencing markers. To reference inputs from your prompt, write the provider's own convention (fal Kling / Seedance `@Image1`, OpenAI / FLUX.2 `"image 1"` prose, Gemini content descriptions); see the image-generation docs for the per-provider table. Provider behavior in this release: -- **OpenAI image** — `gpt-image-1` / `gpt-image-1-mini` route to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` throws a clear not-supported error. -- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single `input_reference` image; passing more than one throws. -- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") receive inputs as multimodal parts in `contents`. Imagen throws (text-only). -- **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). 
Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). -- **Grok** — New `grok-imagine-image` / `grok-imagine-image-quality` models. With `imageInputs`, they route to xAI's JSON `/v1/images/edits` endpoint (up to 3 source images, referenceable as ``, `` in the prompt; `role: 'mask'` / `'control'` throw). Their `size` uses an `aspectRatio_resolution` template (`'16:9_2k'`, suffix optional) mirroring Gemini's native image models. `grok-2-image-1212` remains text-to-image only and throws on `imageInputs`. -- **OpenRouter** — `imageInputs` are injected as multimodal `image_url` content parts alongside the prompt and forwarded to the underlying image model. URL sources pass through verbatim (no fetching or re-encoding in your process); `data` sources become data URIs. +- **OpenAI image** — Prompts with image parts route `gpt-image-1` / `gpt-image-1-mini` to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` rejects image parts at compile time and at runtime. +- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single image part as `input_reference`; passing more than one throws. +- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") map prompt parts 1:1 onto multimodal `contents`, preserving interleaved order. Imagen is text-only (compile-time + runtime rejection). +- **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. 
nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Per-model prompt modalities are derived at the type level from the SDK's endpoint input types. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). +- **Grok** — New `grok-imagine-image` / `grok-imagine-image-quality` models. Prompts with image parts route to xAI's JSON `/v1/images/edits` endpoint (up to 3 source images, addressed by xAI in request order; the prompt is sent verbatim). `role: 'mask'` / `'control'` throw. Their `size` uses an `aspectRatio_resolution` template (`'16:9_2k'`, suffix optional) mirroring Gemini's native image models. `grok-2-image-1212` remains text-to-image only. +- **OpenRouter** — Prompt parts map 1:1 onto multimodal `text` / `image_url` chat content parts, preserving interleaved order, and are forwarded to the underlying image model. URL sources pass through verbatim (no fetching or re-encoding in your process); `data` sources become data URIs. - **Anthropic** — Unchanged (no image generation API). +A new `resolveMediaPrompt()` utility (exported from `@tanstack/ai`) is the single down-conversion point from the canonical interleaved prompt shape to flattened text + per-modality part buckets, for adapter authors. + Closes #618. 
diff --git a/.gitignore b/.gitignore index 6678fb779..b261f62d1 100644 --- a/.gitignore +++ b/.gitignore @@ -78,3 +78,4 @@ solo.yml # Agent scratch output (gap-analysis reports, triage notes — generated locally) .agent/gap-analysis/ .agent/triage/ +.agent/research/ diff --git a/docs/adapters/grok.md b/docs/adapters/grok.md index 7c0ef33d2..7103895b4 100644 --- a/docs/adapters/grok.md +++ b/docs/adapters/grok.md @@ -174,15 +174,20 @@ const result = await generateImage({ ### Image Editing (image-to-image) -The grok-imagine models accept `imageInputs` for image-conditioned +The grok-imagine models accept image prompt parts for image-conditioned generation via xAI's `/v1/images/edits` endpoint — up to 3 source images, -referenceable in the prompt as ``, ``: +addressed by xAI in the order they appear in the prompt. Per xAI's docs +there is no in-prompt referencing syntax; write the prompt naturally and +your text is sent verbatim: ```typescript const result = await generateImage({ adapter: grokImage("grok-imagine-image"), - prompt: "Render in the style of ", - imageInputs: [ + prompt: [ + { + type: "text", + content: "Render the product in the style of the second image", + }, { type: "image", source: { type: "url", value: "https://example.com/product.png" }, @@ -197,7 +202,8 @@ const result = await generateImage({ URL sources are fetched by xAI's servers, so they must be publicly reachable; use a `data` source for private images. `grok-2-image-1212` is -text-to-image only and throws when `imageInputs` is passed. +text-to-image only — image prompt parts are a compile-time type error and +throw at runtime. 
## Text-to-Speech diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md index 85174a820..816503770 100644 --- a/docs/media/image-generation.md +++ b/docs/media/image-generation.md @@ -76,12 +76,9 @@ All image adapters support these common options: | Option | Type | Description | |--------|------|-------------| | `adapter` | `ImageAdapter` | Image adapter instance with model (required) | -| `prompt` | `string` | Text description of the image to generate (required) | +| `prompt` | `string \| MediaPromptPart[]` | Description of the image to generate (required). A plain string, or — on models that support image-conditioned generation — an ordered array of content parts interleaving text with image inputs. See [Image-Conditioned Generation](#image-conditioned-generation) below. | | `numberOfImages` | `number` | Number of images to generate | | `size` | `string` | Size of the generated image in WIDTHxHEIGHT format | -| `imageInputs?` | `ImagePart[]` | Image conditioning inputs for image-to-image, reference-guided, edit, or multi-reference generation. See [Image-Conditioned Generation](#image-conditioned-generation) below. | -| `videoInputs?` | `VideoPart[]` | Video conditioning inputs. Provider support is limited; most adapters throw. | -| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs. Provider support is limited; most adapters throw. | | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) | ### Size Options @@ -135,26 +132,78 @@ const result = await generateImage({ ## Image-Conditioned Generation -`generateImage()` accepts an optional `imageInputs` field for image-to-image, -reference-guided, multi-reference, and edit / inpaint flows. 
The field reuses -the same `ImagePart` shape used elsewhere for multimodal content: +For image-to-image, reference-guided, multi-reference, and edit / inpaint +flows, pass the `prompt` as an ordered array of content parts — the same +`TextPart` / `ImagePart` shapes used elsewhere for multimodal content: ```typescript -import { generateImage, type ImagePart } from '@tanstack/ai' +import { generateImage } from '@tanstack/ai' import { openaiImage } from '@tanstack/ai-openai' -const reference: ImagePart = { - type: 'image', - source: { type: 'url', value: 'https://example.com/product.png' }, -} - await generateImage({ adapter: openaiImage('gpt-image-1'), - prompt: 'Turn this into a cinematic product photo', - imageInputs: [reference], + prompt: [ + { type: 'text', content: 'Turn this into a cinematic product photo' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, + }, + ], }) ``` +Part order is meaningful. Providers with natively multimodal prompts +(Gemini image models, OpenRouter) receive the parts exactly as written, so +text can refer to its neighbouring images: + +```typescript +await generateImage({ + adapter: geminiImage('gemini-3.1-flash-image-preview'), + prompt: [ + { type: 'text', content: 'Not like this' }, + { type: 'image', source: { type: 'url', value: badExampleUrl } }, + { type: 'text', content: 'more like this' }, + { type: 'image', source: { type: 'url', value: goodExampleUrl } }, + ], +}) +``` + +Providers with named request fields (OpenAI, fal, xAI) extract the image +parts and flatten the text (text parts are joined verbatim, paragraph +separated). + +The accepted part types are narrowed **per model at compile time**: passing +an image part to a text-only model (e.g. `dall-e-3`, Imagen) is a type +error, not just a runtime throw. 
+ +### Referencing images from your prompt + +**Your prompt text is always sent verbatim — the SDK never injects or +rewrites referencing markers.** When you want the text to refer to specific +input images, write the provider's own convention yourself: + +| Provider | Convention | Example | +| -------- | ---------- | ------- | +| **OpenAI** (gpt-image) | Indexed prose, per OpenAI's prompting guide | `"apply the style of image 2 to image 1"` | +| **FLUX.2 on fal / BFL** | Indexed prose (BFL's docs parse `image N`) | `"subject from image 1, style from image 2"` | +| **Gemini** (native image models) | Describe the reference by content/role | `"using the attached fabric sample as the texture"` | +| **fal Kling / Seedance endpoints** | `@`-tags, 1-indexed by input order | `"Put @Image1 in the style of @Image2"` | +| **xAI grok-imagine** | No in-prompt syntax — images addressed in request order | `"render the product in the style of the second image"` | + +To keep track of which part you meant by "image 2" or `@Image2`, you can +label parts with the informational `metadata.tag` field — the SDK ignores +it, but it keeps your code self-documenting: + +```typescript +prompt: [ + { type: 'text', content: 'Put @Image1 in the style of @Image2' }, + { type: 'image', source: { type: 'url', value: productUrl }, + metadata: { tag: 'product' } }, + { type: 'image', source: { type: 'url', value: styleUrl }, + metadata: { tag: 'style' } }, +] +``` + ### Source format `ImagePart.source` is a discriminated union supporting both URLs and inline @@ -192,8 +241,8 @@ mapping. 
```typescript await generateImage({ adapter: openaiImage('gpt-image-1'), - prompt: 'Replace the masked region with a tree', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Replace the masked region with a tree' }, { type: 'image', source: { type: 'url', value: photoUrl }, @@ -210,20 +259,23 @@ await generateImage({ #### Multi-reference composition ```typescript -const product: ImagePart = { - type: 'image', - source: { type: 'url', value: 'https://example.com/product.png' }, -} - -const style: ImagePart = { - type: 'image', - source: { type: 'url', value: 'https://example.com/style.png' }, -} - await generateImage({ adapter: geminiImage('gemini-3.1-flash-image-preview'), - prompt: 'Generate a new image of the product using the style of the second reference', - imageInputs: [product, style], + prompt: [ + { + type: 'text', + content: + 'Generate a new image of the product using the style of the second reference', + }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/product.png' }, + }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/style.png' }, + }, + ], }) ``` @@ -232,10 +284,10 @@ await generateImage({ | Provider | Behavior | | ------------ | --------------------------------------------------------------------------------------------------------- | | **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.
`dall-e-2` → `images.edit()` with 1 source image only.
`dall-e-3` → throws (no edit support). | -| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → inputs become multimodal parts in `contents`. Up to ~14 input images.
Imagen models → throws (text-to-image only). | +| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → prompt parts map 1:1 onto multimodal `contents`, preserving interleaved order. Up to ~14 input images.
Imagen models → throws (text-to-image only). | | **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (e.g. nano-banana edit gets `image_urls`, Fooocus masks get `mask_image_url`). Defaults for unknown endpoints: 1 input → `image_url`; multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | -| **Grok** | grok-imagine models → xAI's `/v1/images/edits` (up to 3 source images, referenceable as ``, `` in the prompt). `role: 'mask'` / `'control'` throw (no Imagine API equivalent). `grok-2-image-1212` throws (text-to-image only). | -| **OpenRouter** | Inputs are injected as multimodal `image_url` content parts alongside the prompt and forwarded to the underlying image model. | +| **Grok** | grok-imagine models → xAI's `/v1/images/edits` (up to 3 source images, addressed by xAI in request order; prompt sent verbatim). `role: 'mask'` / `'control'` throw (no Imagine API equivalent). `grok-2-image-1212` throws (text-to-image only). | +| **OpenRouter** | Prompt parts map 1:1 onto multimodal `image_url` / `text` content parts, preserving interleaved order, and are forwarded to the underlying image model. | | **Anthropic** | n/a — no image generation API. | Adapters that don't support image-conditioned generation throw a clear diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index 0ff38767b..e22056b04 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -363,39 +363,53 @@ And returns: | Option | Type | Description | |--------|------|-------------| | `adapter` | `VideoAdapter` | Video adapter instance with model (required) | -| `prompt` | `string` | Text description of the video to generate (required) | +| `prompt` | `string \| MediaPromptPart[]` | Description of the video to generate (required). 
A plain string, or — on models that support conditioned generation — an ordered array of content parts interleaving text with image / video / audio inputs. See [Image-to-Video](#image-to-video) below. | | `size` | `string` | Video resolution in WIDTHxHEIGHT format | | `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) | -| `imageInputs?` | `ImagePart[]` | Image conditioning inputs — starting frame, end frame, character / reference images. See [Image-to-Video](#image-to-video) below. | -| `videoInputs?` | `VideoPart[]` | Video conditioning inputs for video-to-video / source clip flows. Provider support varies. | -| `audioInputs?` | `AudioPart[]` | Audio conditioning inputs for lipsync / voice cloning flows. Provider support varies. | | `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) | ## Image-to-Video -`generateVideo()` accepts `imageInputs` for starting-frame, ending-frame, -and reference-image conditioned video generation: +For starting-frame, ending-frame, and reference-image conditioned video +generation, pass the `prompt` as an array of content parts: ```typescript -import { generateVideo, type ImagePart } from '@tanstack/ai' +import { generateVideo } from '@tanstack/ai' import { openaiVideo } from '@tanstack/ai-openai' -const startingFrame: ImagePart = { - type: 'image', - source: { - type: 'data', - value: base64Image, - mimeType: 'image/png', - }, -} - const { jobId } = await generateVideo({ adapter: openaiVideo('sora-2'), - prompt: 'Animate this still into a slow cinematic push-in with subtle motion', - imageInputs: [startingFrame], + prompt: [ + { + type: 'text', + content: + 'Animate this still into a slow cinematic push-in with subtle motion', + }, + { + type: 'image', + source: { + type: 'data', + value: base64Image, + mimeType: 'image/png', + }, + }, + ], }) ``` +The accepted part types are narrowed **per model at compile time** — fal +endpoints, for example, only admit image / 
video / audio parts that their +SDK input type actually declares fields for. + +Prompt text is always sent **verbatim** — the SDK never injects or rewrites +in-prompt referencing markers. Some fal video endpoints have their own +referencing syntax you can write directly in your text (e.g. Kling v3 +elements as `@Element1`, Seedance 2.0 reference-to-video as `@Image1` / +`@Video1` / `@Audio1`, 1-indexed by input order); Veo and Sora take +reference images as plain inputs with naturally written prompts. See +[Referencing images from your prompt](./image-generation.md#referencing-images-from-your-prompt) +for the per-provider table. + ### Role hints Each `ImagePart` can carry an optional `metadata.role` hint that the @@ -413,9 +427,9 @@ import { falVideo } from '@tanstack/ai-fal' await generateVideo({ adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'), - prompt: 'Slow cinematic push-in then a hard cut', - imageInputs: [ + prompt: [ { type: 'image', source: { type: 'url', value: firstFrameUrl } }, + { type: 'text', content: 'Slow cinematic push-in then a hard cut' }, { type: 'image', source: { type: 'url', value: lastFrameUrl }, @@ -429,9 +443,9 @@ await generateVideo({ | Provider | Image-to-Video Behavior | | ------------ | -------------------------------------------------------------------------------------------------------- | -| **OpenAI** | Sora-2 / Sora-2-Pro → first input goes to `input_reference`. Single image only — throws if more than one. | +| **OpenAI** | Sora-2 / Sora-2-Pro → the image part goes to `input_reference`; flattened text is the prompt. Single image only — throws if more than one. | | **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types — e.g. `role: 'start_frame'` lands on `image_url` for Kling/Veo image-to-video, `first_frame_url` for first-last-frame endpoints, and `start_image_url` otherwise. 
Defaults: single input → `image_url` (start frame); `role: 'end_frame'` → `end_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. | -| **Gemini** | Veo adapter not yet implemented — `imageInputs` will be supported when Veo lands. | +| **Gemini** | Veo adapter not yet implemented — image prompt parts will be supported when Veo lands. | Adapters whose underlying API can't accept image inputs throw a clear runtime error so calls fail fast. diff --git a/packages/ai-fal/src/adapters/image.ts b/packages/ai-fal/src/adapters/image.ts index 88c885a74..b4327d40f 100644 --- a/packages/ai-fal/src/adapters/image.ts +++ b/packages/ai-fal/src/adapters/image.ts @@ -1,4 +1,5 @@ import { fal } from '@fal-ai/client' +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseImageAdapter } from '@tanstack/ai/adapters' import { buildFalUsage, @@ -14,8 +15,10 @@ import type { GeneratedImage, ImageGenerationOptions, ImageGenerationResult, + ResolvedMediaPrompt, } from '@tanstack/ai' import type { + FalImagePromptModalitiesFor, FalImageProviderOptions, FalModel, FalModelImageSize, @@ -46,7 +49,8 @@ export class FalImageAdapter extends BaseImageAdapter< TModel, FalImageProviderOptions, Record>, - Record> + Record>, + Record> > { override readonly kind = 'image' as const readonly name = 'fal' as const @@ -69,19 +73,21 @@ export class FalImageAdapter extends BaseImageAdapter< model: this.model, }) - if (options.videoInputs?.length) { + const resolved = resolveMediaPrompt(options.prompt) + + if (resolved.videos.length > 0) { throw new Error( - `fal.generateImages does not support videoInputs on model ${this.model}.`, + `fal.generateImages does not support video prompt parts on model ${this.model}.`, ) } - if (options.audioInputs?.length) { + if (resolved.audios.length > 0) { throw new Error( - `fal.generateImages does not support audioInputs on model ${this.model}.`, + `fal.generateImages does not support audio prompt parts 
on model ${this.model}.`, ) } try { - const input = this.buildInput(options) + const input = this.buildInput(options, resolved) const result = await fal.subscribe(this.model, { input }) return this.transformResponse(result) } catch (error) { @@ -98,20 +104,20 @@ export class FalImageAdapter extends BaseImageAdapter< FalImageProviderOptions, FalModelImageSize >, + resolved: ResolvedMediaPrompt, ): FalModelInput { const sizeParams = mapSizeToFalFormat(options.size) // Order matters: modelOptions first (so user overrides win for // mask_url / control_image_url / reference_image_urls), then size, // then derived image-input fields, then prompt / num_images. - const inputFields = mapImageInputsToFalFields( - this.model, - options.imageInputs, - ) + const inputFields = mapImageInputsToFalFields(this.model, resolved.images) const input = { ...options.modelOptions, ...sizeParams, ...inputFields, - prompt: options.prompt, + // Media-only prompts (e.g. upscalers, background removal) omit the + // prompt field entirely rather than sending an empty string. + ...(resolved.text ? 
{ prompt: resolved.text } : {}), num_images: options.numberOfImages, } as FalModelInput return input diff --git a/packages/ai-fal/src/adapters/video.ts b/packages/ai-fal/src/adapters/video.ts index a46db5bc5..7d8e424be 100644 --- a/packages/ai-fal/src/adapters/video.ts +++ b/packages/ai-fal/src/adapters/video.ts @@ -1,4 +1,5 @@ import { fal } from '@fal-ai/client' +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseVideoAdapter } from '@tanstack/ai/adapters' import { buildFalUsage, @@ -21,6 +22,7 @@ import type { FalModel, FalModelInput, FalModelVideoSize, + FalVideoPromptModalitiesFor, FalVideoProviderOptions, } from '../model-meta' import type { FalClientConfig } from '../utils' @@ -65,7 +67,7 @@ function mapAudioInputsToFalFields( const [part, ...rest] = audioInputs if (!part || rest.length > 0) { throw new Error( - `fal: exactly one audioInput is supported (received ${audioInputs.length}).`, + `fal: exactly one audio prompt part is supported (received ${audioInputs.length}).`, ) } return { @@ -130,7 +132,8 @@ export class FalVideoAdapter extends BaseVideoAdapter< TModel, FalVideoProviderOptions, Record>, - Record> + Record>, + Record> > { override readonly kind = 'video' as const readonly name = 'fal' as const @@ -146,16 +149,7 @@ export class FalVideoAdapter extends BaseVideoAdapter< FalModelVideoSize >, ): Promise { - const { - prompt, - size, - duration, - modelOptions, - logger, - imageInputs, - videoInputs, - audioInputs, - } = options + const { size, duration, modelOptions, logger } = options logger.request(`activity=generateVideo provider=fal model=${this.model}`, { provider: 'fal', @@ -163,13 +157,14 @@ export class FalVideoAdapter extends BaseVideoAdapter< }) try { + const resolved = resolveMediaPrompt(options.prompt) const sizeParams = mapVideoSizeToFalFormat(size) const inputImageFields = mapImageInputsToFalVideoFields( this.model, - imageInputs, + resolved.images, ) - const videoFields = mapVideoInputsToFalFields(videoInputs) - const 
audioFields = mapAudioInputsToFalFields(audioInputs) + const videoFields = mapVideoInputsToFalFields(resolved.videos) + const audioFields = mapAudioInputsToFalFields(resolved.audios) const input = { ...modelOptions, @@ -177,7 +172,9 @@ export class FalVideoAdapter extends BaseVideoAdapter< ...inputImageFields, ...videoFields, ...audioFields, - prompt, + // Media-only prompts omit the prompt field rather than sending an + // empty string (e.g. pure image-to-video endpoints). + ...(resolved.text ? { prompt: resolved.text } : {}), ...(duration ? { duration } : {}), } as FalModelInput diff --git a/packages/ai-fal/src/image/image-inputs.ts b/packages/ai-fal/src/image/image-inputs.ts index f6943b837..377f9d3fd 100644 --- a/packages/ai-fal/src/image/image-inputs.ts +++ b/packages/ai-fal/src/image/image-inputs.ts @@ -121,7 +121,7 @@ function bucketByRole( } /** - * Map TanStack `imageInputs` onto fal.ai image-endpoint fields. + * Map the prompt's image parts onto fal.ai image-endpoint fields. * * fal endpoints use different field names for image-conditioned generation * (~80% use `image_url` for single; the rest use `image_urls`, @@ -177,7 +177,7 @@ export function mapImageInputsToFalFields( } /** - * Map TanStack `imageInputs` onto fal.ai video-endpoint fields. + * Map the prompt's image parts onto fal.ai video-endpoint fields. * * Video endpoints often expose a start frame as `image_url` (76% of i2v * models) plus an optional `end_image_url`. Multi-reference video models diff --git a/packages/ai-fal/src/model-meta.ts b/packages/ai-fal/src/model-meta.ts index 5aa5367ca..d69bb704b 100644 --- a/packages/ai-fal/src/model-meta.ts +++ b/packages/ai-fal/src/model-meta.ts @@ -4,6 +4,8 @@ * These types give you full autocomplete and type safety for any model. 
*/ import type { EndpointTypeMap } from '@fal-ai/client/endpoints' +import type { MediaPromptModality } from '@tanstack/ai' +import type { FalImageFieldName } from './image/generated/image-field-overrides' export type { EndpointTypeMap } from '@fal-ai/client/endpoints' @@ -118,6 +120,46 @@ export type FalModelVideoSizeInput = : never : { aspect_ratio?: string; resolution?: string } +/** + * Prompt input modalities for a fal image endpoint, derived from the SDK's + * endpoint input type: an endpoint accepts image prompt parts exactly when + * its input declares one of the known image-conditioning fields + * (`image_url`, `image_urls`, `mask_url`, …). Endpoints unknown to the + * installed SDK are unconstrained. + */ +export type FalImagePromptModalitiesFor = + TModel extends keyof EndpointTypeMap + ? ReadonlyArray< + Extract, FalImageFieldName> extends never + ? never + : 'image' + > + : ReadonlyArray + +/** + * Prompt input modalities for a fal video endpoint. Image conditioning is + * detected via the same field set as image endpoints; video conditioning via + * `video_url` / `video_urls` / `reference_video_urls`; audio conditioning + * via `audio_url`. Endpoints unknown to the installed SDK are unconstrained. + */ +export type FalVideoPromptModalitiesFor = + TModel extends keyof EndpointTypeMap + ? ReadonlyArray< + | (Extract, FalImageFieldName> extends never + ? never + : 'image') + | (Extract< + keyof FalModelInput, + 'video_url' | 'video_urls' | 'reference_video_urls' + > extends never + ? never + : 'video') + | (Extract, 'audio_url'> extends never + ? never + : 'audio') + > + : ReadonlyArray + /** * Provider options for video generation, excluding fields TanStack AI handles. * Use this for the `modelOptions` parameter in video generation. 
diff --git a/packages/ai-gemini/src/adapters/image.ts b/packages/ai-gemini/src/adapters/image.ts index 0fbbfadd2..b107df197 100644 --- a/packages/ai-gemini/src/adapters/image.ts +++ b/packages/ai-gemini/src/adapters/image.ts @@ -1,3 +1,4 @@ +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseImageAdapter } from '@tanstack/ai/adapters' import { arrayBufferToBase64 } from '@tanstack/ai-utils' import { @@ -15,6 +16,7 @@ import { } from '../image/image-provider-options' import type { GEMINI_IMAGE_MODELS } from '../model-meta' import type { + GeminiImageModelInputModalitiesByName, GeminiImageModelProviderOptionsByName, GeminiImageModelSizeByName, GeminiImageProviderOptions, @@ -25,6 +27,7 @@ import type { ImageGenerationResult, ImagePart, MediaInputMetadata, + ResolvedMediaPrompt, } from '@tanstack/ai' import type { Content, @@ -65,7 +68,8 @@ export class GeminiImageAdapter< TModel, GeminiImageProviderOptions, GeminiImageModelProviderOptionsByName, - GeminiImageModelSizeByName + GeminiImageModelSizeByName, + GeminiImageModelInputModalitiesByName > { override readonly kind = 'image' as const readonly name = 'gemini' as const @@ -75,6 +79,7 @@ export class GeminiImageAdapter< providerOptions: GeminiImageProviderOptions modelProviderOptionsByName: GeminiImageModelProviderOptionsByName modelSizeByName: GeminiImageModelSizeByName + modelInputModalitiesByName: GeminiImageModelInputModalitiesByName } private readonly client: GoogleGenAI @@ -87,7 +92,7 @@ export class GeminiImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - const { model, prompt, logger } = options + const { model, logger } = options logger.request( `activity=generateImage provider=gemini model=${this.model}`, @@ -98,27 +103,33 @@ export class GeminiImageAdapter< ) try { - validatePrompt({ prompt, model }) + const resolved = resolveMediaPrompt(options.prompt) + + // Image-only prompts are allowed (the image inputs carry the intent); + // a prompt with neither text 
nor images is always an error. + if (resolved.images.length === 0) { + validatePrompt({ prompt: resolved.text, model }) + } - if (options.videoInputs?.length) { + if (resolved.videos.length > 0) { throw new Error( - `${this.name}.generateImages does not support videoInputs (model: ${model}).`, + `${this.name}.generateImages does not support video prompt parts (model: ${model}).`, ) } - if (options.audioInputs?.length) { + if (resolved.audios.length > 0) { throw new Error( - `${this.name}.generateImages does not support audioInputs (model: ${model}).`, + `${this.name}.generateImages does not support audio prompt parts (model: ${model}).`, ) } if (this.isGeminiImageModel(model)) { - return await this.generateWithGeminiApi(options) + return await this.generateWithGeminiApi(options, resolved) } // Imagen does not accept image inputs — it's strictly text-to-image. - if (options.imageInputs?.length) { + if (resolved.images.length > 0) { throw new Error( - `${this.name}: model "${model}" (Imagen) does not support imageInputs. ` + + `${this.name}: model "${model}" (Imagen) does not support image prompt parts. ` + `Use a Gemini-native image model (e.g. gemini-2.5-flash-image, "nano-banana") for image-conditioned generation.`, ) } @@ -131,7 +142,7 @@ export class GeminiImageAdapter< const response = await this.client.models.generateImages({ model, - prompt, + prompt: resolved.text, config, }) @@ -151,19 +162,12 @@ export class GeminiImageAdapter< private async generateWithGeminiApi( options: ImageGenerationOptions, + resolved: ResolvedMediaPrompt, ): Promise { - const { model, prompt, size, numberOfImages, modelOptions, imageInputs } = - options + const { model, size, numberOfImages, modelOptions } = options const parsedSize = size ? parseNativeImageSize(size) : undefined - // The generateContent API has no numberOfImages parameter. - // Instead, augment the prompt to request multiple images when needed. - const augmentedPrompt = - numberOfImages && numberOfImages > 1 - ? 
`${prompt} Generate ${numberOfImages} distinct images.` - : prompt - // GeminiImageProviderOptions is Imagen-shaped — most fields // (personGeneration, safetyFilterLevel, addWatermark, outputMimeType, // outputCompressionQuality, guidanceScale, enhancePrompt, @@ -195,7 +199,7 @@ export class GeminiImageAdapter< }), } - const contents = await this.buildContents(augmentedPrompt, imageInputs) + const contents = await this.buildContents(resolved, numberOfImages) const response = await this.client.models.generateContent({ model, @@ -207,23 +211,47 @@ export class GeminiImageAdapter< } /** - * Build the multimodal `contents` payload. When `imageInputs` is empty the - * SDK accepts a plain prompt string; with inputs we hand it a single user - * `Content` whose `parts` interleave the inline/file image data with the - * text prompt last (Gemini conventionally treats the trailing text as the - * instruction). + * Build the multimodal `contents` payload. Text-only prompts pass through + * as a plain string (the SDK accepts it directly); prompts with image + * parts become a single user `Content` whose `parts` mirror the prompt's + * interleaved order — position is meaningful to Gemini ("not like this + * *(image)*, more like this *(image)*"). + * + * The generateContent API has no numberOfImages parameter, so when more + * than one image is requested a trailing instruction is appended. */ private async buildContents( - prompt: string, - imageInputs?: ReadonlyArray>, + resolved: ResolvedMediaPrompt, + numberOfImages: number | undefined, ): Promise> { - if (!imageInputs || imageInputs.length === 0) { - return prompt + const countInstruction = + numberOfImages && numberOfImages > 1 + ? `Generate ${numberOfImages} distinct images.` + : undefined + + if (resolved.images.length === 0) { + return countInstruction + ? 
`${resolved.text} ${countInstruction}` + : resolved.text } - const imageParts: Array = await Promise.all( - imageInputs.map((part) => this.imagePartToGeminiPart(part)), + + const parts: Array = await Promise.all( + resolved.parts.map((part) => { + if (part.type === 'text') { + return Promise.resolve({ text: part.content }) + } + if (part.type === 'image') { + return this.imagePartToGeminiPart(part) + } + // Video / audio parts were rejected in generateImages above. + throw new Error( + `gemini: unsupported prompt part type "${part.type}" in image generation.`, + ) + }), ) - const parts: Array = [...imageParts, { text: prompt }] + if (countInstruction) { + parts.push({ text: countInstruction }) + } return [{ role: 'user', parts }] } diff --git a/packages/ai-gemini/src/image/image-provider-options.ts b/packages/ai-gemini/src/image/image-provider-options.ts index 62a933445..9125cde97 100644 --- a/packages/ai-gemini/src/image/image-provider-options.ts +++ b/packages/ai-gemini/src/image/image-provider-options.ts @@ -189,6 +189,18 @@ export type GeminiImageModelSizeByName = { [K in Exclude]: GeminiImageSize } +/** + * Per-model prompt input modalities. Gemini-native image models accept image + * parts in the multimodal prompt (image-conditioned generation via + * generateContent); Imagen models are strictly text-to-image, so their + * `prompt` is constrained to text at compile time. 
+ */ +export type GeminiImageModelInputModalitiesByName = { + [K in GeminiNativeImageModels]: readonly ['image'] +} & { + [K in Exclude]: readonly [] +} + /** * Valid sizes for Gemini Imagen models * Gemini uses aspect ratios, but we map common WIDTHxHEIGHT formats to aspect ratios diff --git a/packages/ai-grok/src/adapters/image.ts b/packages/ai-grok/src/adapters/image.ts index 5d8d04cf7..4a4641dae 100644 --- a/packages/ai-grok/src/adapters/image.ts +++ b/packages/ai-grok/src/adapters/image.ts @@ -1,4 +1,5 @@ import OpenAI from 'openai' +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseImageAdapter } from '@tanstack/ai/adapters' import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { buildImagesUsage } from '@tanstack/openai-base' @@ -17,10 +18,12 @@ import type { ImageGenerationResult, ImagePart, MediaInputMetadata, + ResolvedMediaPrompt, } from '@tanstack/ai' import type OpenAI_SDK from 'openai' import type { GrokImageModel } from '../model-meta' import type { + GrokImageModelInputModalitiesByName, GrokImageModelProviderOptionsByName, GrokImageModelSizeByName, GrokImageProviderOptions, @@ -78,7 +81,7 @@ interface GrokImageEditResponse { * Tree-shakeable adapter for Grok image generation functionality. * Supports the legacy grok-2-image-1212 model (text-to-image via the * OpenAI-compat endpoint) and the grok-imagine image models, which also - * accept `imageInputs` for image-conditioned generation via xAI's + * accept image prompt parts for image-conditioned generation via xAI's * `/v1/images/edits` endpoint (up to 3 source images). 
* * Features: @@ -92,7 +95,8 @@ export class GrokImageAdapter< TModel, GrokImageProviderOptions, GrokImageModelProviderOptionsByName, - GrokImageModelSizeByName + GrokImageModelSizeByName, + GrokImageModelInputModalitiesByName > { override readonly kind = 'image' as const readonly name = 'grok' as const @@ -109,23 +113,26 @@ export class GrokImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - const { model, prompt, numberOfImages, size, modelOptions } = options + const { model, numberOfImages, size, modelOptions } = options + + const resolved = resolveMediaPrompt(options.prompt) + const prompt = resolved.text - if (options.videoInputs?.length || options.audioInputs?.length) { + if (resolved.videos.length > 0 || resolved.audios.length > 0) { throw new Error( - `grok.generateImages does not support videoInputs / audioInputs on model ${model}.`, + `grok.generateImages does not support video / audio prompt parts on model ${model}.`, ) } - if (options.imageInputs?.length) { + if (resolved.images.length > 0) { if (!isGrokImagineImageModel(model)) { throw new Error( - `grok: model "${model}" does not support imageInputs. ` + + `grok: model "${model}" does not support image prompt parts. ` + `Image-conditioned generation requires an Imagine API model ` + `('grok-imagine-image' or 'grok-imagine-image-quality').`, ) } - return await this.editImages(options) + return await this.editImages(options, resolved) } validatePrompt({ prompt, model }) @@ -204,14 +211,16 @@ export class GrokImageAdapter< * SDK's `images.edit()` sends `multipart/form-data`, which xAI rejects), * so this path issues the request directly. One input is sent as * `image: { url }`; multiple inputs (up to 3) as `images: [{ url }, ...]`, - * referenceable in the prompt as ``, ``, ... + * addressed by xAI in the order they are sent. The prompt text is sent + * verbatim — no referencing markers are injected. 
*/ private async editImages( options: ImageGenerationOptions, + resolved: ResolvedMediaPrompt, ): Promise { - const { model, prompt, numberOfImages, size, modelOptions, logger } = - options - const imageInputs = options.imageInputs ?? [] + const { model, numberOfImages, size, modelOptions, logger } = options + const prompt = resolved.text + const imageInputs = resolved.images const unsupportedRole = imageInputs.find( (part) => diff --git a/packages/ai-grok/src/image/image-provider-options.ts b/packages/ai-grok/src/image/image-provider-options.ts index da2e52769..a43353444 100644 --- a/packages/ai-grok/src/image/image-provider-options.ts +++ b/packages/ai-grok/src/image/image-provider-options.ts @@ -159,6 +159,17 @@ export type GrokImageModelSizeByName = { 'grok-imagine-image-quality': GrokImagineImageSize } +/** + * Per-model prompt input modalities. Imagine API models accept image parts + * in the prompt (routed to `/v1/images/edits`, up to 3 images, addressed by + * xAI in request order); grok-2-image is text-to-image only. 
+ */ +export type GrokImageModelInputModalitiesByName = { + 'grok-2-image-1212': readonly [] + 'grok-imagine-image': readonly ['image'] + 'grok-imagine-image-quality': readonly ['image'] +} + /** * Internal options interface for validation */ diff --git a/packages/ai-grok/tests/grok-adapter.test.ts b/packages/ai-grok/tests/grok-adapter.test.ts index 584427606..85913339d 100644 --- a/packages/ai-grok/tests/grok-adapter.test.ts +++ b/packages/ai-grok/tests/grok-adapter.test.ts @@ -215,7 +215,7 @@ describe('Grok adapters', () => { }) }) - describe('Image adapter — imageInputs (Imagine edits endpoint)', () => { + describe('Image adapter — image prompt parts (Imagine edits endpoint)', () => { const editResponse = (body: Record, ok = true) => vi.fn().mockResolvedValue({ ok, @@ -229,7 +229,7 @@ describe('Grok adapters', () => { vi.unstubAllGlobals() }) - it('routes a single imageInput to POST /v1/images/edits', async () => { + it('routes a single image part to POST /v1/images/edits with the prompt sent verbatim', async () => { const mockFetch = editResponse({ data: [{ url: 'https://example.com/edited.png' }], }) @@ -238,8 +238,8 @@ describe('Grok adapters', () => { const adapter = createGrokImage('grok-imagine-image', 'test-api-key') const result = await adapter.generateImages({ model: 'grok-imagine-image', - prompt: 'Make it a pencil sketch', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Make it a pencil sketch' }, { type: 'image', source: { type: 'url', value: 'https://example.com/source.png' }, @@ -260,7 +260,37 @@ describe('Grok adapters', () => { expect(result.images).toEqual([{ url: 'https://example.com/edited.png' }]) }) - it('sends multiple inputs as images[] and maps size to aspect_ratio', async () => { + it('flattens interleaved text verbatim — no markers are injected', async () => { + const mockFetch = editResponse({ data: [{ b64_json: 'aGVsbG8=' }] }) + vi.stubGlobal('fetch', mockFetch) + + const adapter = createGrokImage('grok-imagine-image', 
'test-api-key') + await adapter.generateImages({ + model: 'grok-imagine-image', + prompt: [ + { type: 'text', content: 'Not like' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/bad.png' }, + }, + { type: 'text', content: 'more like' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/good.png' }, + }, + ], + logger: testLogger, + }) + + const body = JSON.parse(mockFetch.mock.calls[0]![1].body) + expect(body.prompt).toBe('Not like\n\nmore like') + expect(body.images).toEqual([ + { url: 'https://example.com/bad.png' }, + { url: 'https://example.com/good.png' }, + ]) + }) + + it('passes user-written referencing text through verbatim, sends images[] and maps size', async () => { const mockFetch = editResponse({ data: [{ b64_json: 'aGVsbG8=' }] }) vi.stubGlobal('fetch', mockFetch) @@ -270,9 +300,8 @@ describe('Grok adapters', () => { ) const result = await adapter.generateImages({ model: 'grok-imagine-image-quality', - prompt: 'Put in the style of ', - size: '1:1', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Put in the style of ' }, { type: 'image', source: { type: 'url', value: 'https://example.com/product.png' }, @@ -282,10 +311,12 @@ describe('Grok adapters', () => { source: { type: 'data', value: 'c3R5bGU=', mimeType: 'image/png' }, }, ], + size: '1:1', logger: testLogger, }) const body = JSON.parse(mockFetch.mock.calls[0]![1].body) + expect(body.prompt).toBe('Put in the style of ') expect(body.images).toEqual([ { url: 'https://example.com/product.png' }, { url: 'data:image/png;base64,c3R5bGU=' }, @@ -295,14 +326,14 @@ describe('Grok adapters', () => { expect(result.images).toEqual([{ b64Json: 'aGVsbG8=' }]) }) - it('throws for imageInputs on the legacy grok-2 image model', async () => { + it('throws for image prompt parts on the legacy grok-2 image model', async () => { const adapter = createGrokImage('grok-2-image-1212', 'test-api-key') await expect( adapter.generateImages({ model: 
'grok-2-image-1212', - prompt: 'Edit this', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Edit this' }, { type: 'image', source: { type: 'url', value: 'https://example.com/a.png' }, @@ -310,7 +341,7 @@ describe('Grok adapters', () => { ], logger: testLogger, }), - ).rejects.toThrow(/does not support imageInputs/) + ).rejects.toThrow(/does not support image prompt parts/) }) it('throws for more than 3 source images', async () => { @@ -323,8 +354,13 @@ describe('Grok adapters', () => { await expect( adapter.generateImages({ model: 'grok-imagine-image', - prompt: 'Combine these', - imageInputs: [part, part, part, part], + prompt: [ + { type: 'text', content: 'Combine these' }, + part, + part, + part, + part, + ], logger: testLogger, }), ).rejects.toThrow(/at most 3 source images/) @@ -336,8 +372,8 @@ describe('Grok adapters', () => { await expect( adapter.generateImages({ model: 'grok-imagine-image', - prompt: 'Inpaint', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Inpaint' }, { type: 'image', source: { type: 'url', value: 'https://example.com/m.png' }, @@ -359,8 +395,8 @@ describe('Grok adapters', () => { await expect( adapter.generateImages({ model: 'grok-imagine-image', - prompt: 'Edit', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Edit' }, { type: 'image', source: { type: 'url', value: 'https://example.com/a.png' }, diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index 4da3a2b68..0878b9258 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -1,4 +1,5 @@ import OpenAI from 'openai' +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseImageAdapter } from '@tanstack/ai/adapters' import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { buildImagesUsage } from '@tanstack/openai-base' @@ -20,6 +21,7 @@ import type { import type OpenAI_SDK from 'openai' import type { OpenAIImageModel } from '../model-meta' 
import type { + OpenAIImageModelInputModalitiesByName, OpenAIImageModelProviderOptionsByName, OpenAIImageModelSizeByName, OpenAIImageProviderOptions, @@ -58,7 +60,8 @@ export class OpenAIImageAdapter< TModel, OpenAIImageProviderOptions, OpenAIImageModelProviderOptionsByName, - OpenAIImageModelSizeByName + OpenAIImageModelSizeByName, + OpenAIImageModelInputModalitiesByName > { override readonly kind = 'image' as const readonly name = 'openai' as const @@ -73,40 +76,34 @@ export class OpenAIImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - const { - model, - prompt, - numberOfImages, - size, - modelOptions, - imageInputs, - videoInputs, - audioInputs, - } = options + const { model, numberOfImages, size, modelOptions } = options + + const resolved = resolveMediaPrompt(options.prompt) + const prompt = resolved.text validatePrompt({ prompt, model }) validateImageSize(model, size) validateNumberOfImages(model, numberOfImages) - if (videoInputs?.length) { + if (resolved.videos.length > 0) { throw new Error( - `${this.name}.generateImages does not support videoInputs (model: ${model}).`, + `${this.name}.generateImages does not support video prompt parts (model: ${model}).`, ) } - if (audioInputs?.length) { + if (resolved.audios.length > 0) { throw new Error( - `${this.name}.generateImages does not support audioInputs (model: ${model}).`, + `${this.name}.generateImages does not support audio prompt parts (model: ${model}).`, ) } - if (imageInputs && imageInputs.length > 0) { + if (resolved.images.length > 0) { return this.editImages({ model: model as OpenAIImageModel, prompt, numberOfImages, size, modelOptions, - imageInputs, + imageInputs: resolved.images, logger: options.logger, }) } @@ -193,7 +190,7 @@ export class OpenAIImageAdapter< const maxImages = EDIT_MAX_IMAGES[model] if (maxImages === 0) { throw new Error( - `${this.name}: model "${model}" does not support imageInputs. 
` + + `${this.name}: model "${model}" does not support image prompt parts. ` + `Use gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`, ) } @@ -212,7 +209,7 @@ export class OpenAIImageAdapter< } if (sourceParts.length === 0) { throw new Error( - `${this.name}: imageInputs contained only mask parts; at least one source image is required.`, + `${this.name}: the prompt contained only mask image parts; at least one source image is required.`, ) } if (sourceParts.length > maxImages) { diff --git a/packages/ai-openai/src/adapters/video.ts b/packages/ai-openai/src/adapters/video.ts index cfd596faf..47218502d 100644 --- a/packages/ai-openai/src/adapters/video.ts +++ b/packages/ai-openai/src/adapters/video.ts @@ -1,4 +1,5 @@ import OpenAI from 'openai' +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseVideoAdapter } from '@tanstack/ai/adapters' import { toRunErrorPayload } from '@tanstack/ai/adapter-internals' import { arrayBufferToBase64 } from '@tanstack/ai-utils' @@ -18,6 +19,7 @@ import type { import type OpenAI_SDK from 'openai' import type { OpenAIVideoModel } from '../model-meta' import type { + OpenAIVideoModelInputModalitiesByName, OpenAIVideoModelProviderOptionsByName, OpenAIVideoModelSizeByName, OpenAIVideoProviderOptions, @@ -68,7 +70,8 @@ export class OpenAIVideoAdapter< TModel, OpenAIVideoProviderOptions, OpenAIVideoModelProviderOptionsByName, - OpenAIVideoModelSizeByName + OpenAIVideoModelSizeByName, + OpenAIVideoModelInputModalitiesByName > { readonly name = 'openai' as const @@ -88,36 +91,38 @@ export class OpenAIVideoAdapter< options: VideoGenerationOptions, ): Promise { const { model, size, duration, modelOptions } = options - const { imageInputs, videoInputs, audioInputs } = options validateVideoSize(model, size) const seconds = duration ?? 
modelOptions?.seconds validateVideoSeconds(model, seconds) - if (videoInputs?.length) { + const resolved = resolveMediaPrompt(options.prompt) + + if (resolved.videos.length > 0) { throw new Error( - `${this.name}.createVideoJob does not support videoInputs (model: ${model}).`, + `${this.name}.createVideoJob does not support video prompt parts (model: ${model}).`, ) } - if (audioInputs?.length) { + if (resolved.audios.length > 0) { throw new Error( - `${this.name}.createVideoJob does not support audioInputs (model: ${model}).`, + `${this.name}.createVideoJob does not support audio prompt parts (model: ${model}).`, ) } - if (imageInputs && imageInputs.length > 1) { + if (resolved.images.length > 1) { throw new Error( - `${this.name}: Sora accepts at most one input_reference image; received ${imageInputs.length}.`, + `${this.name}: Sora accepts at most one input_reference image; received ${resolved.images.length}.`, ) } const request: OpenAI_SDK.Videos.VideoCreateParams = { model, - prompt: options.prompt, + prompt: resolved.text, } - if (imageInputs && imageInputs[0]) { + const [inputReference] = resolved.images + if (inputReference) { // Sora's `input_reference` is a single Uploadable; convert TanStack // ImagePart (URL or base64) → File before handing it to the SDK. - const file = await imagePartToFile(imageInputs[0], 'input-reference') + const file = await imagePartToFile(inputReference, 'input-reference') ;(request as { input_reference?: unknown }).input_reference = file } // `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we diff --git a/packages/ai-openai/src/image/image-provider-options.ts b/packages/ai-openai/src/image/image-provider-options.ts index d6e221ec3..729b40234 100644 --- a/packages/ai-openai/src/image/image-provider-options.ts +++ b/packages/ai-openai/src/image/image-provider-options.ts @@ -199,6 +199,19 @@ export type OpenAIImageModelSizeByName = { 'dall-e-2': DallE2Size } +/** + * Per-model prompt input modalities. 
Models with `images.edit()` support + * (gpt-image family, dall-e-2) accept image parts in the prompt; + * dall-e-3 has no edit endpoint, so its prompt is text-only at compile time. + */ +export type OpenAIImageModelInputModalitiesByName = { + 'gpt-image-2': readonly ['image'] + 'gpt-image-1': readonly ['image'] + 'gpt-image-1-mini': readonly ['image'] + 'dall-e-3': readonly [] + 'dall-e-2': readonly ['image'] +} + /** * Internal options interface for validation */ diff --git a/packages/ai-openai/src/video/video-provider-options.ts b/packages/ai-openai/src/video/video-provider-options.ts index b0f337039..837c2c7b2 100644 --- a/packages/ai-openai/src/video/video-provider-options.ts +++ b/packages/ai-openai/src/video/video-provider-options.ts @@ -66,6 +66,17 @@ export type OpenAIVideoModelSizeByName = { 'sora-2-pro': OpenAIVideoSize } +/** + * Per-model prompt input modalities. Sora models accept a single image part + * in the prompt, mapped to the API's `input_reference` field. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type OpenAIVideoModelInputModalitiesByName = { + 'sora-2': readonly ['image'] + 'sora-2-pro': readonly ['image'] +} + /** * Validate video size for a given model. 
* diff --git a/packages/ai-openai/tests/image-adapter.test.ts b/packages/ai-openai/tests/image-adapter.test.ts index 767f3edf1..d8ab53086 100644 --- a/packages/ai-openai/tests/image-adapter.test.ts +++ b/packages/ai-openai/tests/image-adapter.test.ts @@ -238,13 +238,13 @@ describe('OpenAI Image Adapter', () => { }) }) - describe('imageInputs (image-conditioned generation)', () => { + describe('multimodal prompt (image-conditioned generation)', () => { const imagesEditResponse: OpenAI.Images.ImagesResponse = { created: 0, data: [{ b64_json: 'edited-base64' }], } - it('routes to images.edit() for gpt-image-1 when imageInputs is present', async () => { + it('routes to images.edit() for gpt-image-1 when the prompt has image parts', async () => { const adapter = new TestOpenAIImageAdapter( { apiKey: 'test-api-key' }, 'gpt-image-1', @@ -256,8 +256,8 @@ describe('OpenAI Image Adapter', () => { const result = await adapter.generateImages({ model: 'gpt-image-1', - prompt: 'Make it cinematic', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Make it cinematic' }, { type: 'image', source: { @@ -279,7 +279,7 @@ describe('OpenAI Image Adapter', () => { expect(result.images[0]!.b64Json).toBe('edited-base64') }) - it('rejects dall-e-3 with a clear error when imageInputs is present', async () => { + it('rejects dall-e-3 with a clear error when the prompt has image parts', async () => { const adapter = new TestOpenAIImageAdapter( { apiKey: 'test-api-key' }, 'dall-e-3', @@ -288,8 +288,8 @@ describe('OpenAI Image Adapter', () => { await expect( adapter.generateImages({ model: 'dall-e-3', - prompt: 'edit', - imageInputs: [ + prompt: [ + { type: 'text', content: 'edit' }, { type: 'image', source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, @@ -297,7 +297,7 @@ describe('OpenAI Image Adapter', () => { ], logger: testLogger, }), - ).rejects.toThrow(/does not support imageInputs/) + ).rejects.toThrow(/does not support image prompt parts/) }) it('rejects dall-e-2 when 
more than one source image is provided', async () => { @@ -309,8 +309,8 @@ describe('OpenAI Image Adapter', () => { await expect( adapter.generateImages({ model: 'dall-e-2', - prompt: 'edit', - imageInputs: [ + prompt: [ + { type: 'text', content: 'edit' }, { type: 'image', source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, @@ -340,8 +340,8 @@ describe('OpenAI Image Adapter', () => { await adapter.generateImages({ model: 'gpt-image-1', - prompt: 'replace masked region', - imageInputs: [ + prompt: [ + { type: 'text', content: 'replace masked region' }, { type: 'image', source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, @@ -360,7 +360,7 @@ describe('OpenAI Image Adapter', () => { expect(editArgs.image).toBeInstanceOf(File) }) - it('rejects videoInputs or audioInputs', async () => { + it('rejects video or audio prompt parts', async () => { const adapter = new TestOpenAIImageAdapter( { apiKey: 'test-api-key' }, 'gpt-image-1', @@ -369,8 +369,8 @@ describe('OpenAI Image Adapter', () => { await expect( adapter.generateImages({ model: 'gpt-image-1', - prompt: 'x', - videoInputs: [ + prompt: [ + { type: 'text', content: 'x' }, { type: 'video', source: { type: 'url', value: 'https://example.com/v.mp4' }, @@ -378,13 +378,13 @@ describe('OpenAI Image Adapter', () => { ], logger: testLogger, }), - ).rejects.toThrow(/videoInputs/) + ).rejects.toThrow(/video prompt parts/) await expect( adapter.generateImages({ model: 'gpt-image-1', - prompt: 'x', - audioInputs: [ + prompt: [ + { type: 'text', content: 'x' }, { type: 'audio', source: { type: 'url', value: 'https://example.com/a.mp3' }, @@ -392,7 +392,7 @@ describe('OpenAI Image Adapter', () => { ], logger: testLogger, }), - ).rejects.toThrow(/audioInputs/) + ).rejects.toThrow(/audio prompt parts/) }) }) }) diff --git a/packages/ai-openrouter/src/adapters/image.ts b/packages/ai-openrouter/src/adapters/image.ts index 36ae14bb2..ced370e54 100644 --- a/packages/ai-openrouter/src/adapters/image.ts +++ 
b/packages/ai-openrouter/src/adapters/image.ts @@ -1,4 +1,5 @@ import { OpenRouter } from '@openrouter/sdk' +import { resolveMediaPrompt } from '@tanstack/ai' import { BaseImageAdapter } from '@tanstack/ai/adapters' import { getOpenRouterApiKeyFromEnv, @@ -7,6 +8,7 @@ import { import { buildOpenRouterUsage } from '../usage' import type { OpenRouterClientConfig } from '../utils' import type { + OpenRouterImageModelInputModalitiesByName, OpenRouterImageModelProviderOptionsByName, OpenRouterImageModelSizeByName, OpenRouterImageProviderOptions, @@ -58,7 +60,8 @@ export class OpenRouterImageAdapter< TModel, OpenRouterImageProviderOptions, OpenRouterImageModelProviderOptionsByName, - OpenRouterImageModelSizeByName + OpenRouterImageModelSizeByName, + OpenRouterImageModelInputModalitiesByName > { override readonly kind = 'image' as const readonly name = 'openrouter' as const @@ -77,33 +80,41 @@ export class OpenRouterImageAdapter< async generateImages( options: ImageGenerationOptions, ): Promise { - if (options.videoInputs?.length || options.audioInputs?.length) { + const resolved = resolveMediaPrompt(options.prompt) + + if (resolved.videos.length > 0 || resolved.audios.length > 0) { throw new Error( - `openrouter.generateImages does not support videoInputs / audioInputs on model ${this.model}.`, + `openrouter.generateImages does not support video / audio prompt parts on model ${this.model}.`, ) } - const { model, prompt, numberOfImages, size, modelOptions, logger } = - options + const { model, numberOfImages, size, modelOptions, logger } = options // Use provided aspect_ratio or derive from size const aspectRatio = size ? SIZE_TO_ASPECT_RATIO[size] : undefined - // Image-conditioned generation: inject inputs as multimodal content - // parts alongside the prompt. OpenRouter forwards them to the - // underlying image model (e.g. Gemini image models). 
Role hints carry - // no per-field semantics on the chat-completions pathway — inputs are - // attached in order, like the Gemini adapter's multimodal `contents`. - const imageInputs = options.imageInputs ?? [] + // Image-conditioned generation: map the prompt parts 1:1 onto + // chat-completions content parts, preserving the interleaved order — + // OpenRouter forwards them to the underlying image model (e.g. Gemini + // image models), where position is meaningful. Role hints carry no + // per-field semantics on this pathway. + type ContentItem = + | { type: 'text'; text: string } + | { type: 'image_url'; imageUrl: { url: string } } const content = - imageInputs.length > 0 - ? [ - { type: 'text' as const, text: prompt }, - ...imageInputs.map((part) => ({ - type: 'image_url' as const, - imageUrl: { url: imagePartToUrl(part) }, - })), - ] - : prompt + resolved.images.length > 0 + ? resolved.parts.flatMap((part): Array => { + if (part.type === 'text') { + return [{ type: 'text', text: part.content }] + } + if (part.type === 'image') { + return [ + { type: 'image_url', imageUrl: { url: imagePartToUrl(part) } }, + ] + } + // Video / audio parts were rejected above. + return [] + }) + : resolved.text logger.request( `activity=generateImage provider=openrouter model=${this.model}`, diff --git a/packages/ai-openrouter/src/image/image-provider-options.ts b/packages/ai-openrouter/src/image/image-provider-options.ts index b8974368f..3c4c00a4a 100644 --- a/packages/ai-openrouter/src/image/image-provider-options.ts +++ b/packages/ai-openrouter/src/image/image-provider-options.ts @@ -36,3 +36,13 @@ export type OpenRouterImageModelSizeByName = { | '1344×768' // "16:9" | '1536×672' // "21:9" } + +/** + * Per-model prompt input modalities. 
OpenRouter routes image generation + * through the chat-completions surface where every listed image model + * (Gemini image family, GPT image family) accepts `image_url` content + * parts, so image-conditioned prompts are supported across the board. + */ +export type OpenRouterImageModelInputModalitiesByName = { + [K in (typeof OPENROUTER_IMAGE_MODELS)[number]]: readonly ['image'] +} diff --git a/packages/ai-openrouter/tests/image-adapter.test.ts b/packages/ai-openrouter/tests/image-adapter.test.ts index d62a64fdd..4f078fd33 100644 --- a/packages/ai-openrouter/tests/image-adapter.test.ts +++ b/packages/ai-openrouter/tests/image-adapter.test.ts @@ -242,7 +242,7 @@ describe('OpenRouter Image Adapter', () => { ) }) - it('injects imageInputs as multimodal content parts', async () => { + it('maps image prompt parts onto content parts preserving interleaved order', async () => { const mockResponse = createMockImageResponse([ { url: 'https://example.com/edited.png' }, ]) @@ -253,12 +253,12 @@ describe('OpenRouter Image Adapter', () => { const result = await adapter.generateImages({ model: 'google/gemini-2.5-flash-image', - prompt: 'Turn this into a cinematic product photo', - imageInputs: [ + prompt: [ { type: 'image', source: { type: 'url', value: 'https://example.com/source.png' }, }, + { type: 'text', content: 'Turn this into a cinematic product photo' }, { type: 'image', source: { type: 'data', value: 'c3R5bGU=', mimeType: 'image/png' }, @@ -273,11 +273,11 @@ describe('OpenRouter Image Adapter', () => { { role: 'user', content: [ - { type: 'text', text: 'Turn this into a cinematic product photo' }, { type: 'image_url', imageUrl: { url: 'https://example.com/source.png' }, }, + { type: 'text', text: 'Turn this into a cinematic product photo' }, { type: 'image_url', imageUrl: { url: 'data:image/png;base64,c3R5bGU=' }, @@ -288,7 +288,7 @@ describe('OpenRouter Image Adapter', () => { expect(result.images).toHaveLength(1) }) - it('keeps a plain string prompt when no 
imageInputs are given', async () => { + it('keeps a plain string prompt when no image parts are given', async () => { const mockResponse = createMockImageResponse([ { url: 'https://example.com/image.png' }, ]) @@ -306,14 +306,14 @@ describe('OpenRouter Image Adapter', () => { expect(callArgs.messages[0].content).toBe('A plain prompt') }) - it('throws for videoInputs / audioInputs', async () => { + it('throws for video / audio prompt parts', async () => { const adapter = createAdapter() await expect( adapter.generateImages({ model: 'google/gemini-2.5-flash-image', - prompt: 'Test', - videoInputs: [ + prompt: [ + { type: 'text', content: 'Test' }, { type: 'video', source: { type: 'url', value: 'https://example.com/v.mp4' }, @@ -321,7 +321,7 @@ describe('OpenRouter Image Adapter', () => { ], logger: testLogger, }), - ).rejects.toThrow(/does not support videoInputs \/ audioInputs/) + ).rejects.toThrow(/does not support video \/ audio prompt parts/) }) it('passes imageConfig correctly', async () => { diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 536ad7bd8..ed212e056 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -189,23 +189,30 @@ Result shape: `ImageGenerationResult` with `images` array where each entry has `b64Json?`, `url?`, and `revisedPrompt?`. OpenAI image URLs expire after 1 hour -- download or display immediately. -#### Image-conditioned generation: `imageInputs` / `videoInputs` / `audioInputs` - -Both `generateImage()` and `generateVideo()` accept multimodal conditioning -inputs that reuse the existing `ImagePart` / `VideoPart` / `AudioPart` -shape used elsewhere in TanStack AI. 
Each input may carry an optional +#### Image-conditioned generation: multimodal `prompt` parts + +Both `generateImage()` and `generateVideo()` accept the `prompt` either as +a plain string or as an ordered array of content parts (`TextPart` / +`ImagePart` / `VideoPart` / `AudioPart` — the same shapes used elsewhere in +TanStack AI). Part order is meaningful: natively multimodal providers +(Gemini, OpenRouter) receive parts in order; named-field providers (OpenAI, +fal, xAI) extract media parts and flatten the text. Prompt text is always +sent verbatim — to reference inputs from the prompt, write the provider's +own syntax (fal `@Image1`, OpenAI "image 1" prose); the SDK never injects +or rewrites markers. Each media part may carry an optional `metadata.role` hint that adapters use to route the part to the -provider-specific field. +provider-specific field. The accepted part types are narrowed per model at +compile time via the adapter's input-modality map. ```typescript -import { generateImage, type ImagePart } from '@tanstack/ai' +import { generateImage } from '@tanstack/ai' import { openaiImage } from '@tanstack/ai-openai' // Image-to-image (OpenAI gpt-image-1, dall-e-2) await generateImage({ adapter: openaiImage('gpt-image-1'), - prompt: 'Turn this into a cinematic product photo', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Turn this into a cinematic product photo' }, { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, ], }) @@ -213,8 +220,8 @@ await generateImage({ // Multi-reference (up to 16 for gpt-image-1; up to 14 for Gemini native) await generateImage({ adapter: openaiImage('gpt-image-1'), - prompt: 'Apply the second image as style to the first', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Apply the second image as style to the first' }, { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, { type: 'image', source: { type: 'url', value: 'https://…/style.png' } }, ], @@ -223,8 +230,8 
@@ await generateImage({ // Inpaint via metadata.role === 'mask' (OpenAI gpt-image-1, dall-e-2; fal mask_url) await generateImage({ adapter: openaiImage('gpt-image-1'), - prompt: 'Replace the masked region with a tree', - imageInputs: [ + prompt: [ + { type: 'text', content: 'Replace the masked region with a tree' }, { type: 'image', source: { type: 'url', value: photoUrl } }, { type: 'image', @@ -240,9 +247,9 @@ import { falVideo } from '@tanstack/ai-fal' await generateVideo({ adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'), - prompt: 'Slow cinematic push-in', - imageInputs: [ + prompt: [ { type: 'image', source: { type: 'url', value: firstFrameUrl } }, + { type: 'text', content: 'Slow cinematic push-in' }, { type: 'image', source: { type: 'url', value: lastFrameUrl }, @@ -265,16 +272,16 @@ await generateVideo({ **Provider support matrix:** -| Provider | `generateImage` `imageInputs` | `generateVideo` `imageInputs` | +| Provider | `generateImage` image parts | `generateVideo` image parts | | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | | Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | | fal | Per-endpoint field names from a generated map (`pnpm generate:fal-image-fields`). Defaults: 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | Per-endpoint map (e.g. 
Kling i2v start frame → `image_url`). Defaults: 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | -| Grok | grok-imagine models → `/v1/images/edits` JSON endpoint (≤3 sources, `` prompt refs; mask/control throw). grok-2-image-1212 throws. | n/a | -| OpenRouter | Inputs injected as multimodal `image_url` content parts in the chat-completions message. | n/a | +| Grok | grok-imagine models → `/v1/images/edits` JSON endpoint (≤3 sources, addressed by xAI in request order; prompt sent verbatim; mask/control throw). grok-2-image-1212 throws. | n/a | +| OpenRouter | Prompt parts map 1:1 onto multimodal `text` / `image_url` content parts, preserving interleaved order. | n/a | | Anthropic | n/a (no image generation API). | n/a | -`videoInputs` and `audioInputs` follow the same `metadata.role` convention +Video and audio prompt parts follow the same `metadata.role` convention for video-to-video and lipsync flows on fal; other providers throw when they're passed. @@ -696,39 +703,48 @@ generateSpeech({ > Source: Gemini TTS adapter validation; CodeRabbit review of PR #463. -### h. HIGH: Passing `imageInputs` to a model that doesn't support image-conditioned generation +### h. HIGH: Passing image prompt parts to a model that doesn't support image-conditioned generation -Not every model accepts image-conditioned inputs. Adapters throw a clear -runtime error when the caller passes `imageInputs` to a model that -can't honor it (dall-e-3, Imagen, Grok, OpenRouter), so users learn at -call time rather than getting silently wrong output. +Not every model accepts image-conditioned prompts. The `prompt` type is +narrowed per model, so passing an image part to a text-only model +(dall-e-3, Imagen, grok-2-image) is a **compile-time error**; adapters +also throw a clear runtime error as a backstop, so users learn at call +time rather than getting silently wrong output. 
```typescript -// WRONG — dall-e-3 has no edit/inputs API +// WRONG — dall-e-3 has no edit/inputs API; image parts are a type error generateImage({ adapter: openaiImage('dall-e-3'), - prompt: 'Edit this', - imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], -}) // throws: model "dall-e-3" does not support imageInputs. + prompt: [ + { type: 'text', content: 'Edit this' }, + { type: 'image', source: { type: 'url', value: url } }, // ❌ type error + ], +}) -// WRONG — Imagen is text-to-image only +// WRONG — Imagen is text-to-image only; same compile-time rejection generateImage({ adapter: geminiImage('imagen-4.0-generate-001'), - prompt: 'Edit this', - imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], -}) // throws: Imagen does not support imageInputs. + prompt: [ + { type: 'text', content: 'Edit this' }, + { type: 'image', source: { type: 'url', value: url } }, // ❌ type error + ], +}) -// CORRECT — use a model that supports edits/inputs +// CORRECT — use a model that supports image-conditioned generation generateImage({ adapter: openaiImage('gpt-image-1'), // edits up to 16 images - prompt: 'Edit this', - imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], + prompt: [ + { type: 'text', content: 'Edit this' }, + { type: 'image', source: { type: 'url', value: url } }, + ], }) generateImage({ adapter: geminiImage('gemini-3.1-flash-image-preview'), // native multimodal - prompt: 'Edit this', - imageInputs: [{ type: 'image', source: { type: 'url', value: url } }], + prompt: [ + { type: 'text', content: 'Edit this' }, + { type: 'image', source: { type: 'url', value: url } }, + ], }) ``` diff --git a/packages/ai/src/activities/generateImage/adapter.ts b/packages/ai/src/activities/generateImage/adapter.ts index 532bc7d6d..cbc24b72c 100644 --- a/packages/ai/src/activities/generateImage/adapter.ts +++ b/packages/ai/src/activities/generateImage/adapter.ts @@ -1,4 +1,8 @@ -import type { ImageGenerationOptions, 
ImageGenerationResult } from '../../types' +import type { + ImageGenerationOptions, + ImageGenerationResult, + ModelInputModalitiesByName, +} from '../../types' /** * Resolve the size type for a model from the model-size map. @@ -29,6 +33,8 @@ export interface ImageAdapterConfig { * - TProviderOptions: Base provider-specific options (already resolved) * - TModelProviderOptionsByName: Map from model name to its specific provider options * - TModelSizeByName: Map from model name to its supported sizes + * - TModelInputModalitiesByName: Map from model name to the non-text prompt + * modalities it accepts (constrains the `prompt` part types at compile time) */ export interface ImageAdapter< TModel extends string = string, @@ -38,6 +44,8 @@ export interface ImageAdapter< string, string >, + TModelInputModalitiesByName extends ModelInputModalitiesByName = + ModelInputModalitiesByName, > { /** Discriminator for adapter kind - used by generate() to determine API shape */ readonly kind: 'image' @@ -53,6 +61,7 @@ export interface ImageAdapter< providerOptions: TProviderOptions modelProviderOptionsByName: TModelProviderOptionsByName modelSizeByName: TModelSizeByName + modelInputModalitiesByName: TModelInputModalitiesByName } /** @@ -67,7 +76,7 @@ export interface ImageAdapter< * An ImageAdapter with any/unknown type parameters. * Useful as a constraint in generic functions and interfaces. */ -export type AnyImageAdapter = ImageAdapter +export type AnyImageAdapter = ImageAdapter /** * Abstract base class for image generation adapters. 
@@ -83,11 +92,14 @@ export abstract class BaseImageAdapter< string, string >, + TModelInputModalitiesByName extends ModelInputModalitiesByName = + ModelInputModalitiesByName, > implements ImageAdapter< TModel, TProviderOptions, TModelProviderOptionsByName, - TModelSizeByName + TModelSizeByName, + TModelInputModalitiesByName > { readonly kind = 'image' as const abstract readonly name: string @@ -98,6 +110,7 @@ export abstract class BaseImageAdapter< providerOptions: TProviderOptions modelProviderOptionsByName: TModelProviderOptionsByName modelSizeByName: TModelSizeByName + modelInputModalitiesByName: TModelInputModalitiesByName } protected config: ImageAdapterConfig diff --git a/packages/ai/src/activities/generateImage/index.ts b/packages/ai/src/activities/generateImage/index.ts index 285fb6a56..a1e399533 100644 --- a/packages/ai/src/activities/generateImage/index.ts +++ b/packages/ai/src/activities/generateImage/index.ts @@ -8,16 +8,15 @@ import { aiEventClient } from '@tanstack/ai-event-client' import { streamGenerationResult } from '../stream-generation-result.js' import { resolveDebugOption } from '../../logger/resolve' +import { resolveMediaPrompt } from '../../utilities/media-prompt' import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { ImageAdapter } from './adapter' import type { - AudioPart, ImageGenerationResult, - ImagePart, - MediaInputMetadata, + MediaPrompt, + MediaPromptFor, StreamChunk, - VideoPart, } from '../../types' // =========================== @@ -62,6 +61,23 @@ export type ImageSizeForModel = : string : string +/** + * Extract the prompt type a model accepts from an ImageAdapter via ~types. + * Adapters declare a per-model input-modality map; models in the map get a + * `prompt` narrowed to text + their supported part types (text-only models + * accept `string | Array`), so unsupported media parts fail at + * compile time. 
Adapters without a map fall back to the full MediaPrompt. + */ +export type ImagePromptForModel = + TAdapter extends ImageAdapter + ? string extends keyof ModsByName + ? // No explicit map - accept the full union + MediaPrompt + : TModel extends keyof ModsByName + ? MediaPromptFor + : MediaPrompt + : MediaPrompt + // =========================== // Activity Options Type // =========================== @@ -79,23 +95,20 @@ export type ImageActivityOptions< > = { /** The image adapter to use (must be created with a model) */ adapter: TAdapter & { kind: typeof kind } - /** Text description of the desired image(s) */ - prompt: string + /** + * Description of the desired image(s). Either a plain string, or — for + * models that support image-conditioned generation — an ordered array of + * content parts interleaving text with image inputs (image-to-image, + * reference-guided, edit, multi-reference). Media parts may carry + * `metadata.role` (`'reference' | 'mask' | 'control' | 'character'`) to + * disambiguate intent. The accepted part types are narrowed per model via + * the adapter's input-modality map. + */ + prompt: ImagePromptForModel /** Number of images to generate (default: 1) */ numberOfImages?: number /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */ size?: ImageSizeForModel - /** - * Image conditioning inputs for image-to-image, reference-guided, edit, or - * multi-reference generation. Each part may carry `metadata.role` - * (`'reference' | 'mask' | 'control' | 'character'`) to disambiguate intent. - * Adapters that don't support image-conditioned generation throw clearly. - */ - imageInputs?: Array> - /** Video conditioning inputs. Provider support varies; unsupported adapters throw. */ - videoInputs?: Array> - /** Audio conditioning inputs. Provider support varies; unsupported adapters throw. */ - audioInputs?: Array> /** * Whether to stream the image generation result. * When true, returns an AsyncIterable for streaming transport. 
@@ -221,16 +234,26 @@ async function runGenerateImage< const startTime = Date.now() const logger: InternalLogger = resolveDebugOption(options.debug) + // Devtools events carry the flattened prompt text plus media-part counts — + // the wire payload stays `prompt: string` regardless of the prompt shape. + const resolved = resolveMediaPrompt(rest.prompt) + aiEventClient.emit('image:request:started', { requestId, provider: adapter.name, model, - prompt: rest.prompt, + prompt: resolved.text, numberOfImages: rest.numberOfImages, size: rest.size, - imageInputCount: rest.imageInputs?.length, - videoInputCount: rest.videoInputs?.length, - audioInputCount: rest.audioInputs?.length, + ...(resolved.images.length > 0 && { + imageInputCount: resolved.images.length, + }), + ...(resolved.videos.length > 0 && { + videoInputCount: resolved.videos.length, + }), + ...(resolved.audios.length > 0 && { + audioInputCount: resolved.audios.length, + }), modelOptions: rest.modelOptions, timestamp: startTime, }) diff --git a/packages/ai/src/activities/generateVideo/adapter.ts b/packages/ai/src/activities/generateVideo/adapter.ts index a0b4b4389..4f0eaed21 100644 --- a/packages/ai/src/activities/generateVideo/adapter.ts +++ b/packages/ai/src/activities/generateVideo/adapter.ts @@ -1,4 +1,5 @@ import type { + ModelInputModalitiesByName, VideoGenerationOptions, VideoJobResult, VideoStatusResult, @@ -31,6 +32,8 @@ export interface VideoAdapterConfig { * - TProviderOptions: Provider-specific options (already resolved) * - TModelProviderOptionsByName: Map from model name to its specific provider options * - TModelSizeByName: Map from model name to its supported sizes + * - TModelInputModalitiesByName: Map from model name to the non-text prompt + * modalities it accepts (constrains the `prompt` part types at compile time) */ export interface VideoAdapter< TModel extends string = string, @@ -40,6 +43,8 @@ export interface VideoAdapter< string, string >, + TModelInputModalitiesByName extends 
ModelInputModalitiesByName = + ModelInputModalitiesByName, > { /** Discriminator for adapter kind - used to determine API shape */ readonly kind: 'video' @@ -55,6 +60,7 @@ export interface VideoAdapter< providerOptions: TProviderOptions modelProviderOptionsByName: TModelProviderOptionsByName modelSizeByName: TModelSizeByName + modelInputModalitiesByName: TModelInputModalitiesByName } /** @@ -81,7 +87,7 @@ export interface VideoAdapter< * A VideoAdapter with any/unknown type parameters. * Useful as a constraint in generic functions and interfaces. */ -export type AnyVideoAdapter = VideoAdapter +export type AnyVideoAdapter = VideoAdapter /** * Abstract base class for video generation adapters. @@ -99,11 +105,14 @@ export abstract class BaseVideoAdapter< string, string >, + TModelInputModalitiesByName extends ModelInputModalitiesByName = + ModelInputModalitiesByName, > implements VideoAdapter< TModel, TProviderOptions, TModelProviderOptionsByName, - TModelSizeByName + TModelSizeByName, + TModelInputModalitiesByName > { readonly kind = 'video' as const abstract readonly name: string @@ -114,6 +123,7 @@ export abstract class BaseVideoAdapter< providerOptions: TProviderOptions modelProviderOptionsByName: TModelProviderOptionsByName modelSizeByName: TModelSizeByName + modelInputModalitiesByName: TModelInputModalitiesByName } protected config: VideoAdapterConfig diff --git a/packages/ai/src/activities/generateVideo/index.ts b/packages/ai/src/activities/generateVideo/index.ts index 572759cdf..cc942a5a2 100644 --- a/packages/ai/src/activities/generateVideo/index.ts +++ b/packages/ai/src/activities/generateVideo/index.ts @@ -14,13 +14,11 @@ import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { VideoAdapter } from './adapter' import type { - AudioPart, - ImagePart, - MediaInputMetadata, + MediaPrompt, + MediaPromptFor, StreamChunk, TokenUsage, VideoJobResult, - VideoPart, VideoStatusResult, 
VideoUrlResult, } from '../../types' @@ -54,6 +52,21 @@ export type VideoSizeForAdapter = : string : string +/** + * Extract the prompt type a model accepts from a VideoAdapter via ~types. + * Mirrors `ImagePromptForModel`: models in the adapter's input-modality map + * get a `prompt` narrowed to text + their supported part types; adapters + * without a map fall back to the full MediaPrompt. + */ +export type VideoPromptForAdapter = + TAdapter extends VideoAdapter + ? string extends keyof ModsByName + ? MediaPrompt + : TModel extends keyof ModsByName + ? MediaPromptFor + : MediaPrompt + : MediaPrompt + // =========================== // Activity Options Types @@ -88,22 +101,20 @@ export type VideoCreateOptions< > = VideoActivityBaseOptions & { /** Request type - create a new job (default if not specified) */ request?: 'create' - /** Text description of the desired video */ - prompt: string + /** + * Description of the desired video. Either a plain string, or — for models + * that support image-conditioned generation — an ordered array of content + * parts interleaving text with image inputs. Image parts may carry + * `metadata.role` (`'start_frame' | 'end_frame' | 'reference' | + * 'character'`) to disambiguate intent; positional fallback otherwise. The + * accepted part types are narrowed per model via the adapter's + * input-modality map. + */ + prompt: VideoPromptForAdapter /** Video size — format depends on the provider (e.g., "16:9", "1280x720") */ size?: VideoSizeForAdapter /** Video duration in seconds */ duration?: number - /** - * Image conditioning inputs (start frame, end frame, reference / character - * images). Use `metadata.role` (`'start_frame' | 'end_frame' | 'reference' | - * 'character'`) to disambiguate intent; positional fallback otherwise. - */ - imageInputs?: Array> - /** Video conditioning inputs (video-to-video, source clip). */ - videoInputs?: Array> - /** Audio conditioning inputs (lipsync source, voice reference). 
*/ - audioInputs?: Array> /** * Whether to stream the video generation lifecycle. * When true, returns an AsyncIterable that handles the full @@ -264,16 +275,7 @@ export function generateVideo< async function runCreateVideoJob< TAdapter extends VideoAdapter, >(options: VideoCreateOptions): Promise { - const { - adapter, - prompt, - size, - duration, - modelOptions, - imageInputs, - videoInputs, - audioInputs, - } = options + const { adapter, prompt, size, duration, modelOptions } = options const model = adapter.model const logger: InternalLogger = resolveDebugOption(options.debug) const providerName = @@ -293,9 +295,6 @@ async function runCreateVideoJob< size, duration, modelOptions, - imageInputs, - videoInputs, - audioInputs, logger, }) logger.output(`activity=generateVideo jobId=${result.jobId}`, { @@ -323,16 +322,7 @@ function sleep(ms: number): Promise { async function* runStreamingVideoGeneration< TAdapter extends VideoAdapter, >(options: VideoCreateOptions): AsyncIterable { - const { - adapter, - prompt, - size, - duration, - modelOptions, - imageInputs, - videoInputs, - audioInputs, - } = options + const { adapter, prompt, size, duration, modelOptions } = options const model = adapter.model const runId = options.runId ?? createId('run') const pollingInterval = options.pollingInterval ?? 
2000 @@ -368,9 +358,6 @@ async function* runStreamingVideoGeneration< size, duration, modelOptions, - imageInputs, - videoInputs, - audioInputs, logger, }) diff --git a/packages/ai/src/index.ts b/packages/ai/src/index.ts index dbb38722b..1872d264c 100644 --- a/packages/ai/src/index.ts +++ b/packages/ai/src/index.ts @@ -124,6 +124,10 @@ export * from './types' // Usage utilities export { buildBaseUsage, type BaseUsageInput } from './utilities/usage' +// Media-generation prompt resolution (used by image / video adapters) +export { resolveMediaPrompt } from './utilities/media-prompt' +export type { ResolvedMediaPrompt } from './utilities/media-prompt' + // System prompts (type + normaliser used by adapters) export type { SystemPrompt, NormalizedSystemPrompt } from './system-prompts' export { normalizeSystemPrompts } from './system-prompts' diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 1c48f289b..7ab506016 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1493,8 +1493,76 @@ export type MediaInputRole = export interface MediaInputMetadata { /** Optional role hint disambiguating the part's intent for the adapter */ role?: MediaInputRole + /** + * Optional user-defined label for this input (e.g. `'woman-in-red-dress'`). + * **Informational only** — adapters never read it and the SDK never + * rewrites prompt text based on it. Use it to correlate parts with the + * references you write in your prompt using the provider's own syntax + * (fal's `@Image1`, OpenAI's "image 1", etc.), or for your own + * bookkeeping/logging. + */ + tag?: string +} + +/** + * A single part of a multimodal media-generation prompt. Reuses the chat + * content-part shapes: text parts carry the instruction, image / video / + * audio parts carry conditioning inputs (with an optional + * `metadata.role` hint — see {@link MediaInputRole}). 
+ */ +export type MediaPromptPart = + | TextPart + | ImagePart + | VideoPart + | AudioPart + +/** + * Prompt accepted by `generateImage()` / `generateVideo()`: a plain string, + * or an ordered array of content parts for image-conditioned generation + * ("not like this *(image)*, more like this *(image)*"). Part order is + * meaningful — adapters with native multimodal prompts (Gemini, OpenRouter) + * preserve the interleaving; named-field providers (fal, OpenAI, xAI) + * extract the media parts and flatten the text. Text is always sent + * verbatim: to reference inputs from the prompt, write the provider's own + * syntax yourself (e.g. fal's `@Image1`, OpenAI's "image 1"). An array may + * be media-only (e.g. upscalers or pure img2img endpoints that take no + * instruction text). + */ +export type MediaPrompt = string | Array + +/** + * Non-text modalities a media-generation model can accept in its prompt. + */ +export type MediaPromptModality = 'image' | 'video' | 'audio' + +/** Maps a prompt modality to its content-part type. @internal */ +interface MediaPartByModality { + image: ImagePart + video: VideoPart + audio: AudioPart } +/** + * Prompt type narrowed to the modalities a specific model supports. + * `MediaPromptFor` (a text-only model) is `string | Array`; + * `MediaPromptFor<'image'>` additionally admits image parts, etc. Used by + * the activity option types together with the adapter's per-model input + * modality map so unsupported parts fail at compile time. + */ +export type MediaPromptFor = + | string + | Array + +/** + * Per-model map from model name to the prompt modalities it accepts, used as + * an adapter type parameter (`TModelInputModalitiesByName`). Models absent + * from the map fall back to the unconstrained {@link MediaPrompt}. + */ +export type ModelInputModalitiesByName = Record< + string, + ReadonlyArray +> + /** * Options for image generation. * These are the common options supported across providers. 
@@ -1505,31 +1573,20 @@ export interface ImageGenerationOptions< > { /** The model to use for image generation */ model: string - /** Text description of the desired image(s) */ - prompt: string + /** + * Description of the desired image(s): a plain string, or an ordered array + * of content parts for image-conditioned generation (image-to-image, + * reference-guided, edit, multi-reference). Media parts may carry + * `metadata.role` to disambiguate intent (mask, control, reference, …). + * Adapters map parts onto the provider-native request — e.g. Gemini + * multimodal `contents`, OpenAI `images.edit()`, fal `image_url` / + * `mask_url` — and throw a clear runtime error for unsupported modalities. + */ + prompt: MediaPrompt /** Number of images to generate (default: 1) */ numberOfImages?: number /** Image size in WIDTHxHEIGHT format (e.g., "1024x1024") */ size?: TSize - /** - * Image conditioning inputs (reference / mask / control / start frame / - * character). Reuses the multimodal `ImagePart` shape. Adapters map these - * onto the provider-native request — e.g. OpenAI `images.edit()`, Gemini - * multimodal `contents`, fal `image_url` / `image_urls` / `mask_url`. - * Adapters that do not support image-conditioned generation throw a clear - * runtime error when this field is non-empty. - */ - imageInputs?: Array> - /** - * Video conditioning inputs (video-to-video, edit, lipsync source). - * Not all providers support this; adapters throw when unsupported. - */ - videoInputs?: Array> - /** - * Audio conditioning inputs (audio reference, voice cloning, lipsync). - * Not all providers support this; adapters throw when unsupported. 
- */ - audioInputs?: Array> /** Model-specific options for image generation */ modelOptions?: TProviderOptions /** @@ -1646,30 +1703,19 @@ export interface VideoGenerationOptions< > { /** The model to use for video generation */ model: string - /** Text description of the desired video */ - prompt: string + /** + * Description of the desired video: a plain string, or an ordered array of + * content parts for image-conditioned generation. Image parts may carry + * `metadata.role` (`'start_frame' | 'end_frame' | 'reference' | + * 'character'`) to disambiguate intent; adapters route them onto the + * provider-native request (e.g. OpenAI Sora `input_reference`, fal + * `image_url` / `end_image_url`) and throw at runtime if unsupported. + */ + prompt: MediaPrompt /** Video size — format depends on the provider (e.g., "16:9", "1280x720") */ size?: TSize /** Video duration in seconds */ duration?: number - /** - * Image conditioning inputs (start frame, end frame, character / reference - * images). Reuses the multimodal `ImagePart` shape; adapters route by - * `metadata.role` and array position (e.g. OpenAI Sora `input_reference`, - * fal `image_url` / `end_image_url`, Veo `image` / `lastFrame` / - * `referenceImages`). Adapters throw at runtime if unsupported. - */ - imageInputs?: Array> - /** - * Video conditioning inputs (video-to-video edit, source clip). - * Not all providers support this; adapters throw when unsupported. - */ - videoInputs?: Array> - /** - * Audio conditioning inputs (lipsync source, voice reference). - * Not all providers support this; adapters throw when unsupported. 
- */ - audioInputs?: Array> /** Model-specific options for video generation */ modelOptions?: TProviderOptions /** diff --git a/packages/ai/src/utilities/media-prompt.ts b/packages/ai/src/utilities/media-prompt.ts new file mode 100644 index 000000000..9cefb64ff --- /dev/null +++ b/packages/ai/src/utilities/media-prompt.ts @@ -0,0 +1,86 @@ +import type { + AudioPart, + ImagePart, + MediaInputMetadata, + MediaPrompt, + MediaPromptPart, + TextPart, + VideoPart, +} from '../types' + +/** + * A {@link MediaPrompt} decomposed into the views adapters consume. + * + * Adapters with native multimodal prompts (Gemini `contents`, OpenRouter + * chat content parts) consume `parts` to preserve interleaving; named-field + * providers (fal, OpenAI) consume `text` plus the typed media buckets. + * + * Prompt text is **never rewritten**: text parts are concatenated verbatim. + * Providers that support referencing inputs from the prompt (e.g. fal's + * `@Image1`, OpenAI's "image 1" prose) expect the user to write that syntax + * themselves — the SDK does not inject or substitute markers. + */ +export interface ResolvedMediaPrompt { + /** + * Text parts concatenated verbatim (paragraph-separated). Empty string + * for media-only prompts. + */ + text: string + /** The prompt as ordered parts; a string prompt becomes one text part. */ + parts: Array + /** Image parts in prompt order. */ + images: Array> + /** Video parts in prompt order. */ + videos: Array> + /** Audio parts in prompt order. */ + audios: Array> +} + +/** + * Decompose a {@link MediaPrompt} into flattened text and per-modality part + * buckets, preserving prompt order everywhere. This is the single downrev + * point from the canonical interleaved prompt shape to the named-field + * request shapes most providers expose. 
+ */ +export function resolveMediaPrompt(prompt: MediaPrompt): ResolvedMediaPrompt { + if (typeof prompt === 'string') { + const textPart: TextPart = { type: 'text', content: prompt } + return { + text: prompt, + parts: [textPart], + images: [], + videos: [], + audios: [], + } + } + + const images: Array> = [] + const videos: Array> = [] + const audios: Array> = [] + const textSegments: Array = [] + + for (const part of prompt) { + switch (part.type) { + case 'text': + if (part.content) textSegments.push(part.content) + break + case 'image': + images.push(part) + break + case 'video': + videos.push(part) + break + case 'audio': + audios.push(part) + break + } + } + + return { + text: textSegments.join('\n\n'), + parts: prompt, + images, + videos, + audios, + } +} diff --git a/packages/ai/tests/image-per-model-type-safety.test.ts b/packages/ai/tests/image-per-model-type-safety.test.ts index db67dd048..aeb79d621 100644 --- a/packages/ai/tests/image-per-model-type-safety.test.ts +++ b/packages/ai/tests/image-per-model-type-safety.test.ts @@ -169,6 +169,16 @@ type MockImageModelSizeByName = { 'mock-dall-e-3': MockDallE3Size } +/** + * Type map: model name -> supported prompt input modalities. + * mock-gpt-image-1 accepts image-conditioned prompts; mock-dall-e-3 is + * text-to-image only. 
+ */ +type MockImageModelInputModalitiesByName = { + 'mock-gpt-image-1': readonly ['image'] + 'mock-dall-e-3': readonly [] +} + // =========================== // Mock Model Definitions // =========================== @@ -199,7 +209,8 @@ class MockImageAdapter extends BaseImageAdapter< TModel, MockImageProviderOptions, MockImageModelProviderOptionsByName, - MockImageModelSizeByName + MockImageModelSizeByName, + MockImageModelInputModalitiesByName > { override readonly kind = 'image' as const readonly name = 'mock' as const @@ -850,3 +861,58 @@ describe('Model Size Type Assertions', () => { }) }) }) + +describe('Per-model prompt modality type safety', () => { + it('allows image parts in the prompt for image-input models', () => { + generateImage({ + adapter: mockImage('mock-gpt-image-1'), + prompt: [ + { type: 'text', content: 'Make it cinematic' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/ref.png' }, + metadata: { role: 'reference' }, + }, + ], + }) + }) + + it('rejects image parts in the prompt for text-only models', () => { + generateImage({ + adapter: mockImage('mock-dall-e-3'), + prompt: [ + { type: 'text', content: 'A cat' }, + { + // @ts-expect-error - mock-dall-e-3 does not accept image prompt parts + type: 'image', + source: { type: 'url', value: 'https://example.com/ref.png' }, + }, + ], + }) + }) + + it('rejects video parts for models that only accept image inputs', () => { + generateImage({ + adapter: mockImage('mock-gpt-image-1'), + prompt: [ + { type: 'text', content: 'Animate' }, + { + // @ts-expect-error - mock-gpt-image-1 does not accept video prompt parts + type: 'video', + source: { type: 'url', value: 'https://example.com/v.mp4' }, + }, + ], + }) + }) + + it('always accepts plain string prompts', () => { + generateImage({ + adapter: mockImage('mock-gpt-image-1'), + prompt: 'A cat', + }) + generateImage({ + adapter: mockImage('mock-dall-e-3'), + prompt: 'A cat', + }) + }) +}) diff --git 
a/packages/ai/tests/media-prompt.test.ts b/packages/ai/tests/media-prompt.test.ts new file mode 100644 index 000000000..18bd1dc12 --- /dev/null +++ b/packages/ai/tests/media-prompt.test.ts @@ -0,0 +1,79 @@ +import { describe, expect, it } from 'vitest' +import { resolveMediaPrompt } from '../src/utilities/media-prompt' +import type { ImagePart, MediaInputMetadata, MediaPromptPart } from '../src' + +function image( + value: string, + role?: NonNullable, +): ImagePart { + return { + type: 'image', + source: { type: 'url', value }, + ...(role && { metadata: { role } }), + } +} + +describe('resolveMediaPrompt', () => { + it('wraps a string prompt as a single text part', () => { + const resolved = resolveMediaPrompt('a cat') + expect(resolved.text).toBe('a cat') + expect(resolved.parts).toEqual([{ type: 'text', content: 'a cat' }]) + expect(resolved.images).toEqual([]) + expect(resolved.videos).toEqual([]) + expect(resolved.audios).toEqual([]) + }) + + it('buckets media parts by modality in prompt order', () => { + const parts: Array = [ + image('https://a.png'), + { type: 'text', content: 'animate this' }, + { type: 'video', source: { type: 'url', value: 'https://v.mp4' } }, + { type: 'audio', source: { type: 'url', value: 'https://a.mp3' } }, + image('https://b.png', 'end_frame'), + ] + const resolved = resolveMediaPrompt(parts) + expect(resolved.text).toBe('animate this') + expect(resolved.parts).toBe(parts) + expect(resolved.images.map((p) => p.source.value)).toEqual([ + 'https://a.png', + 'https://b.png', + ]) + expect(resolved.images[1]?.metadata?.role).toBe('end_frame') + expect(resolved.videos).toHaveLength(1) + expect(resolved.audios).toHaveLength(1) + }) + + it('joins multiple text parts with paragraph breaks', () => { + const resolved = resolveMediaPrompt([ + { type: 'text', content: 'first' }, + image('https://a.png'), + { type: 'text', content: 'second' }, + ]) + expect(resolved.text).toBe('first\n\nsecond') + }) + + it('returns empty text for media-only 
prompts', () => { + const resolved = resolveMediaPrompt([image('https://a.png')]) + expect(resolved.text).toBe('') + expect(resolved.images).toHaveLength(1) + }) + + it('skips empty text parts', () => { + const resolved = resolveMediaPrompt([ + { type: 'text', content: '' }, + { type: 'text', content: 'real' }, + ]) + expect(resolved.text).toBe('real') + }) + + it('never rewrites text — provider referencing syntax passes through verbatim', () => { + const resolved = resolveMediaPrompt([ + { + type: 'text', + content: 'Put @Image1 next to from image 1', + }, + image('https://a.png'), + ]) + expect(resolved.text).toBe('Put @Image1 next to from image 1') + }) +}) diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index e43acf0d6..54ca2990f 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -178,9 +178,9 @@ export const matrix: Record> = { ]), // Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format 'image-gen': new Set(['openai', 'grok']), - // image-to-image (imageInputs on generateImage) routes adapters to wire - // endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, Gemini - // multimodal `generateContent`, xAI `/v1/images/edits`, OpenRouter + // image-to-image (image parts in the generateImage prompt) routes adapters + // to wire endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, + // Gemini multimodal `generateContent`, xAI `/v1/images/edits`, OpenRouter // multimodal chat content parts, fal endpoint-specific input fields). // Adapter-level mapping is covered by unit tests. Populate this set when // aimock gains support for those endpoints. 
@@ -190,9 +190,9 @@ export const matrix: Record> = { tts: new Set(['openai', 'grok', 'elevenlabs']), transcription: new Set(['openai', 'grok', 'elevenlabs']), 'video-gen': new Set(['openai']), - // image-to-video (imageInputs on generateVideo) similarly depends on - // aimock mocking Sora's `input_reference` upload field. Populate when - // aimock support lands. + // image-to-video (image parts in the generateVideo prompt) similarly + // depends on aimock mocking Sora's `input_reference` upload field. + // Populate when aimock support lands. 'image-to-video': new Set([]), // Only Gemini currently surfaces a first-class stateful conversation API via // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental). From ff3bb47168c42d49f2c79a3281e7c8402d60aead Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Sun, 7 Jun 2026 12:09:22 +1000 Subject: [PATCH 07/11] fix: address PR review findings for image/video input support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - openai: add gpt-image-2 to the editImages error message and JSDoc (the model is edit-capable via EDIT_MAX_IMAGES but was omitted from user-facing guidance); same fix in docs, SKILL.md, and the changeset - openai: throw when the images.edit() response contains no usable images (matching grok's guard) instead of resolving to { images: [] } - openai: drop the unnecessary input_reference cast in the Sora adapter — the SDK types the field, so assign directly - fal: reject metadata.role 'mask'/'control' in the video mapper instead of silently folding them into source frames - docs: mark Veo role mappings as planned (no Veo adapter yet), note the Gemini ~14-image limit is provider-side, bump samples to gpt-image-2 - tests: cover the Gemini image-conditioned path (interleaved contents, fileData vs inlineData vs fetch+inline, Imagen/video/audio rejection), the Sora input_reference upload and guards 
(new file), the fal video createVideoJob field assembly and audio guard, and the openai empty-edit-response guard Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/image-and-video-inputs.md | 2 +- docs/media/image-generation.md | 18 +- docs/media/video-generation.md | 8 +- packages/ai-fal/src/image/image-inputs.ts | 18 +- packages/ai-fal/tests/image-inputs.test.ts | 14 ++ packages/ai-fal/tests/video-adapter.test.ts | 85 +++++++++ .../ai-gemini/tests/image-adapter.test.ts | 174 ++++++++++++++++++ packages/ai-openai/src/adapters/image.ts | 15 +- packages/ai-openai/src/adapters/video.ts | 6 +- .../ai-openai/tests/image-adapter.test.ts | 26 +++ .../ai-openai/tests/video-adapter.test.ts | 114 ++++++++++++ .../skills/ai-core/media-generation/SKILL.md | 33 ++-- 12 files changed, 472 insertions(+), 41 deletions(-) create mode 100644 packages/ai-openai/tests/video-adapter.test.ts diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md index da3e301e9..d1d5d51ec 100644 --- a/.changeset/image-and-video-inputs.md +++ b/.changeset/image-and-video-inputs.md @@ -14,7 +14,7 @@ Prompt text is always sent **verbatim** — the SDK never injects or rewrites in Provider behavior in this release: -- **OpenAI image** — Prompts with image parts route `gpt-image-1` / `gpt-image-1-mini` to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` rejects image parts at compile time and at runtime. +- **OpenAI image** — Prompts with image parts route `gpt-image-2` / `gpt-image-1` / `gpt-image-1-mini` to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` rejects image parts at compile time and at runtime. - **OpenAI video** — Sora-2 / Sora-2-Pro accept a single image part as `input_reference`; passing more than one throws. 
- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") map prompt parts 1:1 onto multimodal `contents`, preserving interleaved order. Imagen is text-only (compile-time + runtime rejection). - **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Per-model prompt modalities are derived at the type level from the SDK's endpoint input types. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md index 816503770..358e9bd9e 100644 --- a/docs/media/image-generation.md +++ b/docs/media/image-generation.md @@ -22,7 +22,7 @@ TanStack AI provides support for image generation through dedicated image adapte Image generation is handled by image adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI. 
The image adapters support: -- **OpenAI**: DALL-E 2, DALL-E 3, GPT-Image-1, and GPT-Image-1-Mini models +- **OpenAI**: DALL-E 2, DALL-E 3, GPT-Image-1, GPT-Image-1-Mini, and GPT-Image-2 models - **Gemini**: Gemini native image models (NanoBanana) and Imagen 3/4 models - **fal.ai**: 600+ models including Nano Banana Pro, FLUX, and more @@ -141,7 +141,7 @@ import { generateImage } from '@tanstack/ai' import { openaiImage } from '@tanstack/ai-openai' await generateImage({ - adapter: openaiImage('gpt-image-1'), + adapter: openaiImage('gpt-image-2'), prompt: [ { type: 'text', content: 'Turn this into a cinematic product photo' }, { @@ -230,17 +230,17 @@ mapping. | Role | Maps to | | --------------- | -------------------------------------------------------------------------------------- | | `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional fallback | -| `'character'` | Same as `'reference'`; Veo `referenceImages` slot | -| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | +| `'character'` | Same as `'reference'`; Veo `referenceImages` slot (planned — no Veo adapter yet) | +| `'mask'` | OpenAI `mask` (gpt-image-2, gpt-image-1, dall-e-2); fal `mask_url` | | `'control'` | fal `control_image_url` (ControlNet / depth / pose conditioning) | -| `'start_frame'` | fal `start_image_url`; Veo `image` (used by `generateVideo`) | -| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` (used by `generateVideo`) | +| `'start_frame'` | fal `start_image_url`; Veo `image` (planned) (used by `generateVideo`) | +| `'end_frame'` | fal `end_image_url`; Veo `lastFrame` (planned) (used by `generateVideo`) | #### Inpaint / edit with a mask ```typescript await generateImage({ - adapter: openaiImage('gpt-image-1'), + adapter: openaiImage('gpt-image-2'), prompt: [ { type: 'text', content: 'Replace the masked region with a tree' }, { @@ -283,8 +283,8 @@ await generateImage({ | Provider | Behavior | | ------------ | 
--------------------------------------------------------------------------------------------------------- | -| **OpenAI** | `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.<br>`dall-e-2` → `images.edit()` with 1 source image only.<br>`dall-e-3` → throws (no edit support). | -| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → prompt parts map 1:1 onto multimodal `contents`, preserving interleaved order. Up to ~14 input images.<br>Imagen models → throws (text-to-image only). | +| **OpenAI** | `gpt-image-2` / `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask.<br>`dall-e-2` → `images.edit()` with 1 source image only.<br>`dall-e-3` → throws (no edit support). | +| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → prompt parts map 1:1 onto multimodal `contents`, preserving interleaved order. Up to ~14 input images (provider limit, not enforced by the SDK).<br>Imagen models → throws (text-to-image only). | | **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (e.g. nano-banana edit gets `image_urls`, Fooocus masks get `mask_image_url`). Defaults for unknown endpoints: 1 input → `image_url`; multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. | | **Grok** | grok-imagine models → xAI's `/v1/images/edits` (up to 3 source images, addressed by xAI in request order; prompt sent verbatim). `role: 'mask'` / `'control'` throw (no Imagine API equivalent). `grok-2-image-1212` throws (text-to-image only). | | **OpenRouter** | Prompt parts map 1:1 onto multimodal `image_url` / `text` content parts, preserving interleaved order, and are forwarded to the underlying image model. | diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index e22056b04..513a3014f 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -417,10 +417,10 @@ adapter uses to route the input to the provider-specific field: | Role | Maps to | | --------------- | ------------------------------------------------------------- | -| `'start_frame'` | fal `start_image_url` (positional default for the first input) | -| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` when available) | -| `'reference'` | fal `reference_image_urls` (Veo `referenceImages`) | -| `'character'` | Same as `'reference'` — character consistency images | +| `'start_frame'` | fal `start_image_url` (positional default for the first input) | +| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` planned — no Veo adapter yet) | +| `'reference'` | fal `reference_image_urls` (Veo `referenceImages` planned) | +| `'character'` | Same as `'reference'` — character consistency images | ```typescript import { falVideo } from '@tanstack/ai-fal' 
diff --git a/packages/ai-fal/src/image/image-inputs.ts b/packages/ai-fal/src/image/image-inputs.ts index 377f9d3fd..6196627c6 100644 --- a/packages/ai-fal/src/image/image-inputs.ts +++ b/packages/ai-fal/src/image/image-inputs.ts @@ -189,7 +189,8 @@ export function mapImageInputsToFalFields( * - `metadata.role === 'start_frame'` → spec.start * - `metadata.role === 'end_frame'` → spec.end * - `metadata.role === 'reference' | 'character'` → spec.reference - * - remaining parts (any other / no role) → spec.single / spec.multi + * - `metadata.role === 'mask' | 'control'` → throws (no video routing) + * - remaining parts (no role) → spec.single / spec.multi */ export function mapImageInputsToFalVideoFields( model: TModel, @@ -200,8 +201,15 @@ export function mapImageInputsToFalVideoFields( const spec = fieldSpecFor(model) const { sources, masks, controls, references, starts, ends } = bucketByRole(imageInputs) - // Mask / control roles have no video-specific routing; treat as sources. - const allSources = [...sources, ...masks, ...controls] + // Mask / control roles have no video-specific routing; silently repurposing + // them as source frames would hide the problem, so reject them instead. + if (masks.length > 0 || controls.length > 0) { + const role = masks.length > 0 ? 'mask' : 'control' + throw new Error( + `fal: metadata.role === '${role}' is not supported for video generation on model ${model}. ` + + `Remove the role or pass the field explicitly via modelOptions.`, + ) + } if (starts.length > 1) { throw new Error( @@ -215,8 +223,8 @@ export function mapImageInputsToFalVideoFields( } const fields: Record = {} - const sourceField = allSources.length > 1 ? spec.multi : spec.single - assignField(fields, sourceField, allSources, model, 'source') + const sourceField = sources.length > 1 ? 
spec.multi : spec.single + assignField(fields, sourceField, sources, model, 'source') assignField(fields, spec.reference, references, model, 'reference') // Frame roles assign last: when an endpoint routes the start frame to its // generic source field (e.g. Kling image-to-video) and an unroled source diff --git a/packages/ai-fal/tests/image-inputs.test.ts b/packages/ai-fal/tests/image-inputs.test.ts index 2e5a13eea..ead3d76c2 100644 --- a/packages/ai-fal/tests/image-inputs.test.ts +++ b/packages/ai-fal/tests/image-inputs.test.ts @@ -185,6 +185,20 @@ describe('mapImageInputsToFalVideoFields', () => { }) }) + it('throws on mask/control roles instead of repurposing them as sources', () => { + expect(() => + mapImageInputsToFalVideoFields(UNKNOWN_MODEL, [ + urlPart('https://example.com/start.png'), + urlPart('https://example.com/mask.png', { role: 'mask' }), + ]), + ).toThrow(/'mask' is not supported for video generation/) + expect(() => + mapImageInputsToFalVideoFields(UNKNOWN_MODEL, [ + urlPart('https://example.com/depth.png', { role: 'control' }), + ]), + ).toThrow(/'control' is not supported for video generation/) + }) + describe('generated endpoint overrides', () => { it('routes role=start_frame to the source field on image-to-video endpoints', () => { // Kling i2v takes the start frame as plain image_url, the end frame diff --git a/packages/ai-fal/tests/video-adapter.test.ts b/packages/ai-fal/tests/video-adapter.test.ts index 7bf5ce466..2b6577bd8 100644 --- a/packages/ai-fal/tests/video-adapter.test.ts +++ b/packages/ai-fal/tests/video-adapter.test.ts @@ -1,5 +1,6 @@ import { beforeEach, describe, expect, it, vi } from 'vitest' import { generateVideo } from '@tanstack/ai' +import { resolveDebugOption } from '@tanstack/ai/adapter-internals' import { falVideo } from '../src/adapters/video' import { recordBillableUnitsFromResponse } from '../src/utils/billing' @@ -15,6 +16,8 @@ function seedBillableUnits(requestId: string, units: string) { ) } +const testLogger = 
resolveDebugOption(false) + // Declare mocks at module level let mockQueueSubmit: any let mockQueueStatus: any @@ -166,6 +169,88 @@ describe('Fal Video Adapter', () => { }) }) + describe('createVideoJob with a multimodal prompt', () => { + it('maps prompt media parts onto fal input fields', async () => { + mockQueueSubmit.mockResolvedValueOnce({ request_id: 'job-mm' }) + + const adapter = createAdapter() + + await adapter.createVideoJob({ + model: 'fal-ai/veo3/image-to-video', + prompt: [ + { type: 'text', content: 'Slow cinematic push-in' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/start.png' }, + }, + { + type: 'video', + source: { type: 'url', value: 'https://example.com/ref.mp4' }, + metadata: { role: 'reference' }, + }, + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/voice.mp3' }, + }, + ], + logger: testLogger, + }) + + const [, options] = mockQueueSubmit.mock.calls[0]! + expect(options.input).toEqual({ + prompt: 'Slow cinematic push-in', + image_url: 'https://example.com/start.png', + reference_video_urls: ['https://example.com/ref.mp4'], + audio_url: 'https://example.com/voice.mp3', + }) + }) + + it('omits the prompt field for media-only prompts', async () => { + mockQueueSubmit.mockResolvedValueOnce({ request_id: 'job-i2v' }) + + const adapter = createAdapter() + + await adapter.createVideoJob({ + model: 'fal-ai/veo3/image-to-video', + prompt: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/start.png' }, + }, + ], + logger: testLogger, + }) + + const [, options] = mockQueueSubmit.mock.calls[0]! 
+ expect(options.input).toEqual({ + image_url: 'https://example.com/start.png', + }) + }) + + it('throws when more than one audio prompt part is provided', async () => { + const adapter = createAdapter() + + await expect( + adapter.createVideoJob({ + model: 'fal-ai/veo3/image-to-video', + prompt: [ + { type: 'text', content: 'x' }, + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/a.mp3' }, + }, + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/b.mp3' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/exactly one audio prompt part/) + expect(mockQueueSubmit).not.toHaveBeenCalled() + }) + }) + describe('getVideoStatus', () => { it('returns pending status for queued jobs', async () => { mockQueueStatus.mockResolvedValueOnce({ diff --git a/packages/ai-gemini/tests/image-adapter.test.ts b/packages/ai-gemini/tests/image-adapter.test.ts index c2b7db2b0..64d9b0a76 100644 --- a/packages/ai-gemini/tests/image-adapter.test.ts +++ b/packages/ai-gemini/tests/image-adapter.test.ts @@ -1,5 +1,6 @@ import { describe, it, expect, vi } from 'vitest' import { generateImage } from '@tanstack/ai' +import { resolveDebugOption } from '@tanstack/ai/adapter-internals' import { GeminiImageAdapter, createGeminiImage } from '../src/adapters/image' import { parseNativeImageSize, @@ -662,4 +663,177 @@ describe('Gemini Image Adapter', () => { }) }) }) + + describe('multimodal prompt (image-conditioned generation)', () => { + const testLogger = resolveDebugOption(false) + const mockImageResponse = { + candidates: [ + { + content: { + parts: [{ inlineData: { mimeType: 'image/png', data: 'out' } }], + }, + }, + ], + } + + function mockedNativeAdapter() { + const mockGenerateContent = vi + .fn() + .mockResolvedValueOnce(mockImageResponse) + const adapter = createGeminiImage( + 'gemini-3.1-flash-image-preview', + 'test-api-key', + ) + ;( + adapter as unknown as { + client: { models: { generateContent: unknown } } + } + ).client = { + models: 
{ generateContent: mockGenerateContent }, + } + return { adapter, mockGenerateContent } + } + + it('maps interleaved prompt parts onto multimodal contents in order', async () => { + const { adapter, mockGenerateContent } = mockedNativeAdapter() + + await generateImage({ + adapter, + prompt: [ + { type: 'text', content: 'Not like this' }, + { + type: 'image', + source: { type: 'data', value: 'YmFk', mimeType: 'image/jpeg' }, + }, + { type: 'text', content: 'more like this' }, + { + type: 'image', + // Google Files API URIs pass through as fileData (no fetch). + source: { + type: 'url', + value: + 'https://generativelanguage.googleapis.com/v1beta/files/abc', + mimeType: 'image/png', + }, + }, + ], + }) + + expect(mockGenerateContent).toHaveBeenCalledWith({ + model: 'gemini-3.1-flash-image-preview', + contents: [ + { + role: 'user', + parts: [ + { text: 'Not like this' }, + { inlineData: { mimeType: 'image/jpeg', data: 'YmFk' } }, + { text: 'more like this' }, + { + fileData: { + fileUri: + 'https://generativelanguage.googleapis.com/v1beta/files/abc', + mimeType: 'image/png', + }, + }, + ], + }, + ], + config: { responseModalities: ['TEXT', 'IMAGE'] }, + }) + }) + + it('fetches arbitrary URL sources and inlines them as base64', async () => { + const { adapter, mockGenerateContent } = mockedNativeAdapter() + // 'hi' → base64 'aGk=' + const fetchMock = vi.fn().mockResolvedValue( + new Response(new Uint8Array([104, 105]), { + headers: { 'content-type': 'image/jpeg' }, + }), + ) + vi.stubGlobal('fetch', fetchMock) + + try { + await generateImage({ + adapter, + prompt: [ + { type: 'text', content: 'Edit this' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/photo.jpg' }, + }, + ], + }) + } finally { + vi.unstubAllGlobals() + } + + expect(fetchMock).toHaveBeenCalledWith('https://example.com/photo.jpg') + const args = mockGenerateContent.mock.calls[0]![0] + expect(args.contents).toEqual([ + { + role: 'user', + parts: [ + { text: 'Edit this' }, + { 
inlineData: { mimeType: 'image/jpeg', data: 'aGk=' } }, + ], + }, + ]) + }) + + it('rejects image prompt parts for Imagen models', async () => { + const adapter = createGeminiImage( + 'imagen-4.0-generate-001', + 'test-api-key', + ) + + await expect( + adapter.generateImages({ + model: 'imagen-4.0-generate-001', + prompt: [ + { type: 'text', content: 'Edit this' }, + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/does not support image prompt parts/) + }) + + it('rejects video and audio prompt parts', async () => { + const adapter = createGeminiImage( + 'gemini-3.1-flash-image-preview', + 'test-api-key', + ) + + await expect( + adapter.generateImages({ + model: 'gemini-3.1-flash-image-preview', + prompt: [ + { type: 'text', content: 'x' }, + { + type: 'video', + source: { type: 'url', value: 'https://example.com/v.mp4' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/video prompt parts/) + + await expect( + adapter.generateImages({ + model: 'gemini-3.1-flash-image-preview', + prompt: [ + { type: 'text', content: 'x' }, + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/a.mp3' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/audio prompt parts/) + }) + }) }) diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index 0878b9258..ead8834c6 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -173,9 +173,10 @@ export class OpenAIImageAdapter< /** * Image-conditioned generation via OpenAI's `images.edit()` endpoint. - * dall-e-2 accepts 1 input image; gpt-image-1 / gpt-image-1-mini accept up - * to 16; dall-e-3 rejects entirely. A part with `metadata.role === 'mask'` - * is routed to the SDK's `mask` field (PNG with alpha channel). 
+ * dall-e-2 accepts 1 input image; gpt-image-2 / gpt-image-1 / + * gpt-image-1-mini accept up to 16; dall-e-3 rejects entirely. A part with + * `metadata.role === 'mask'` is routed to the SDK's `mask` field (PNG with + * alpha channel). */ private async editImages(args: { model: OpenAIImageModel @@ -191,7 +192,7 @@ export class OpenAIImageAdapter< if (maxImages === 0) { throw new Error( `${this.name}: model "${model}" does not support image prompt parts. ` + - `Use gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`, + `Use gpt-image-2, gpt-image-1, gpt-image-1-mini, or dall-e-2 for image-conditioned generation.`, ) } @@ -272,6 +273,12 @@ export class OpenAIImageAdapter< }, ) + // Surface empty responses (e.g. moderation blocks returning items with + // neither b64_json nor url) instead of resolving to `{ images: [] }`. + if (images.length === 0) { + throw new Error(`${this.name}: image edit response contained no images`) + } + return { id: generateId(this.name), model, diff --git a/packages/ai-openai/src/adapters/video.ts b/packages/ai-openai/src/adapters/video.ts index 47218502d..15bb2a733 100644 --- a/packages/ai-openai/src/adapters/video.ts +++ b/packages/ai-openai/src/adapters/video.ts @@ -122,8 +122,10 @@ export class OpenAIVideoAdapter< if (inputReference) { // Sora's `input_reference` is a single Uploadable; convert TanStack // ImagePart (URL or base64) → File before handing it to the SDK. - const file = await imagePartToFile(inputReference, 'input-reference') - ;(request as { input_reference?: unknown }).input_reference = file + request.input_reference = await imagePartToFile( + inputReference, + 'input-reference', + ) } // `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we // narrow before assignment instead of casting from a `T | undefined` source. 
diff --git a/packages/ai-openai/tests/image-adapter.test.ts b/packages/ai-openai/tests/image-adapter.test.ts index d8ab53086..10b2578ad 100644 --- a/packages/ai-openai/tests/image-adapter.test.ts +++ b/packages/ai-openai/tests/image-adapter.test.ts @@ -360,6 +360,32 @@ describe('OpenAI Image Adapter', () => { expect(editArgs.image).toBeInstanceOf(File) }) + it('throws when the edit response contains no usable images', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + // Items with neither b64_json nor url (e.g. moderation blocks) must + // surface as an error, not resolve to `{ images: [] }`. + adapter + .spyOnImagesEdit() + .mockResolvedValueOnce({ created: 0, data: [{}] }) + + await expect( + adapter.generateImages({ + model: 'gpt-image-1', + prompt: [ + { type: 'text', content: 'edit' }, + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/image edit response contained no images/) + }) + it('rejects video or audio prompt parts', async () => { const adapter = new TestOpenAIImageAdapter( { apiKey: 'test-api-key' }, diff --git a/packages/ai-openai/tests/video-adapter.test.ts b/packages/ai-openai/tests/video-adapter.test.ts new file mode 100644 index 000000000..5d28241a8 --- /dev/null +++ b/packages/ai-openai/tests/video-adapter.test.ts @@ -0,0 +1,114 @@ +import { describe, expect, it, vi } from 'vitest' +import { resolveDebugOption } from '@tanstack/ai/adapter-internals' +import { OpenAIVideoAdapter, createOpenaiVideo } from '../src/adapters/video' + +const testLogger = resolveDebugOption(false) + +/** + * Replace the SDK's `videos` client with a mock. `createVideoJob` reaches the + * SDK exclusively through `getVideosClient()`, so swapping the `videos` + * resource is enough; the adapter's own request assembly stays real. 
+ */ +function mockedAdapter() { + const adapter = createOpenaiVideo('sora-2', 'test-api-key') + const mockCreate = vi.fn().mockResolvedValue({ id: 'video-job-1' }) + ;(adapter as unknown as { client: { videos: unknown } }).client = { + videos: { create: mockCreate }, + } + return { adapter, mockCreate } +} + +describe('OpenAI Video Adapter', () => { + it('creates an adapter with the provided API key', () => { + const adapter = createOpenaiVideo('sora-2', 'test-api-key') + expect(adapter).toBeInstanceOf(OpenAIVideoAdapter) + expect(adapter.name).toBe('openai') + expect(adapter.model).toBe('sora-2') + }) + + describe('createVideoJob with a multimodal prompt', () => { + it('uploads a single image part as input_reference with verbatim prompt text', async () => { + const { adapter, mockCreate } = mockedAdapter() + + const result = await adapter.createVideoJob({ + model: 'sora-2', + prompt: [ + { type: 'text', content: 'Slow cinematic push-in' }, + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + ], + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledTimes(1) + const request = mockCreate.mock.calls[0]![0] + expect(request.model).toBe('sora-2') + expect(request.prompt).toBe('Slow cinematic push-in') + expect(request.input_reference).toBeInstanceOf(File) + expect(result.jobId).toBe('video-job-1') + expect(result.model).toBe('sora-2') + }) + + it('throws when more than one image part is provided', async () => { + const { adapter, mockCreate } = mockedAdapter() + + await expect( + adapter.createVideoJob({ + model: 'sora-2', + prompt: [ + { type: 'text', content: 'x' }, + { + type: 'image', + source: { type: 'data', value: 'aGk=', mimeType: 'image/png' }, + }, + { + type: 'image', + source: { + type: 'data', + value: 'YnllCg==', + mimeType: 'image/png', + }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/at most one input_reference image/) + expect(mockCreate).not.toHaveBeenCalled() + }) + + it('rejects 
video and audio prompt parts', async () => { + const { adapter, mockCreate } = mockedAdapter() + + await expect( + adapter.createVideoJob({ + model: 'sora-2', + prompt: [ + { type: 'text', content: 'x' }, + { + type: 'video', + source: { type: 'url', value: 'https://example.com/v.mp4' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/video prompt parts/) + + await expect( + adapter.createVideoJob({ + model: 'sora-2', + prompt: [ + { type: 'text', content: 'x' }, + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/a.mp3' }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/audio prompt parts/) + expect(mockCreate).not.toHaveBeenCalled() + }) + }) +}) diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index ed212e056..af9d80cf0 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -208,18 +208,19 @@ compile time via the adapter's input-modality map. 
import { generateImage } from '@tanstack/ai' import { openaiImage } from '@tanstack/ai-openai' -// Image-to-image (OpenAI gpt-image-1, dall-e-2) +// Image-to-image (OpenAI gpt-image-2 / gpt-image-1, dall-e-2) await generateImage({ - adapter: openaiImage('gpt-image-1'), + adapter: openaiImage('gpt-image-2'), prompt: [ { type: 'text', content: 'Turn this into a cinematic product photo' }, { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, ], }) -// Multi-reference (up to 16 for gpt-image-1; up to 14 for Gemini native) +// Multi-reference (up to 16 for gpt-image models; up to ~14 for Gemini native +// — a provider limit, not enforced by the SDK) await generateImage({ - adapter: openaiImage('gpt-image-1'), + adapter: openaiImage('gpt-image-2'), prompt: [ { type: 'text', content: 'Apply the second image as style to the first' }, { type: 'image', source: { type: 'url', value: 'https://…/product.png' } }, @@ -227,9 +228,9 @@ await generateImage({ ], }) -// Inpaint via metadata.role === 'mask' (OpenAI gpt-image-1, dall-e-2; fal mask_url) +// Inpaint via metadata.role === 'mask' (OpenAI gpt-image models, dall-e-2; fal mask_url) await generateImage({ - adapter: openaiImage('gpt-image-1'), + adapter: openaiImage('gpt-image-2'), prompt: [ { type: 'text', content: 'Replace the masked region with a tree' }, { type: 'image', source: { type: 'url', value: photoUrl } }, @@ -261,20 +262,20 @@ await generateVideo({ **Role hints** (`metadata.role`): -| Role | Maps to | -| --------------- | ------------------------------------------------------------------------------------------- | -| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | -| `'character'` | Same as `'reference'`; Veo `referenceImages` slot | -| `'mask'` | OpenAI `mask` (gpt-image-1, dall-e-2); fal `mask_url` | -| `'control'` | fal `control_image_url` (ControlNet / depth / pose) | -| `'start_frame'` | fal `start_image_url` (or the endpoint's field, e.g. 
`image_url` on Kling i2v); Veo `image` | -| `'end_frame'` | fal `end_image_url` (or e.g. `tail_image_url` / `last_frame_url`); Veo `lastFrame` | +| Role | Maps to | +| --------------- | ----------------------------------------------------------------------------------------------------- | +| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional otherwise | +| `'character'` | Same as `'reference'`; Veo `referenceImages` slot (planned — no Veo adapter yet) | +| `'mask'` | OpenAI `mask` (gpt-image-2, gpt-image-1, dall-e-2); fal `mask_url` | +| `'control'` | fal `control_image_url` (ControlNet / depth / pose) | +| `'start_frame'` | fal `start_image_url` (or the endpoint's field, e.g. `image_url` on Kling i2v); Veo `image` (planned) | +| `'end_frame'` | fal `end_image_url` (or e.g. `tail_image_url` / `last_frame_url`); Veo `lastFrame` (planned) | **Provider support matrix:** | Provider | `generateImage` image parts | `generateVideo` image parts | | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| OpenAI | gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | +| OpenAI | gpt-image-2 / gpt-image-1 / -mini → `images.edit()` (up to 16). dall-e-2 → edit (1). dall-e-3 throws. | Sora-2 / -pro → `input_reference` (single). Throws if >1. | | Gemini | Native (gemini-\*-flash-image, "nano-banana") → multimodal `contents`. Imagen throws. | No native Veo adapter yet — deferred to a follow-up. | | fal | Per-endpoint field names from a generated map (`pnpm generate:fal-image-fields`). 
Defaults: 1 input → `image_url`; >1 → `image_urls`; roles → `mask_url` / `control_image_url` / `reference_image_urls`. | Per-endpoint map (e.g. Kling i2v start frame → `image_url`). Defaults: 1 input → `image_url`; `start_frame`/`end_frame` → `start_image_url`/`end_image_url`; `reference` → `reference_image_urls`. | | Grok | grok-imagine models → `/v1/images/edits` JSON endpoint (≤3 sources, addressed by xAI in request order; prompt sent verbatim; mask/control throw). grok-2-image-1212 throws. | n/a | @@ -732,7 +733,7 @@ generateImage({ // CORRECT — use a model that supports image-conditioned generation generateImage({ - adapter: openaiImage('gpt-image-1'), // edits up to 16 images + adapter: openaiImage('gpt-image-2'), // edits up to 16 images prompt: [ { type: 'text', content: 'Edit this' }, { type: 'image', source: { type: 'url', value: url } }, From 32f2175ad0897845e9b8d1f76e5967fe9aa99e37 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Sun, 7 Jun 2026 17:08:51 +1000 Subject: [PATCH 08/11] fix(ai-openai): throw on empty generateImages responses too Same defect class as the editImages guard in the previous commit: the text-to-image path silently resolved to { images: [] } when response items had neither b64_json nor url. Surface it as an error instead. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/ai-openai/src/adapters/image.ts | 6 ++++++ packages/ai-openai/tests/image-adapter.test.ts | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/packages/ai-openai/src/adapters/image.ts b/packages/ai-openai/src/adapters/image.ts index ead8834c6..d11a67b67 100644 --- a/packages/ai-openai/src/adapters/image.ts +++ b/packages/ai-openai/src/adapters/image.ts @@ -150,6 +150,12 @@ export class OpenAIImageAdapter< }, ) + // Surface empty responses (e.g. moderation blocks returning items with + // neither b64_json nor url) instead of resolving to `{ images: [] }`. 
+ if (images.length === 0) { + throw new Error(`${this.name}: image response contained no images`) + } + // `ImageGenerationResult.usage` is `usage?: TokenUsage` without // `| undefined`, so spread the field only when the model reported usage. const usage = buildImagesUsage(response.usage) diff --git a/packages/ai-openai/tests/image-adapter.test.ts b/packages/ai-openai/tests/image-adapter.test.ts index 10b2578ad..ffae82297 100644 --- a/packages/ai-openai/tests/image-adapter.test.ts +++ b/packages/ai-openai/tests/image-adapter.test.ts @@ -208,6 +208,24 @@ describe('OpenAI Image Adapter', () => { }) }) + it('throws when the response contains no usable images', async () => { + const adapter = new TestOpenAIImageAdapter( + { apiKey: 'test-api-key' }, + 'gpt-image-1', + ) + adapter + .spyOnImagesGenerate() + .mockResolvedValueOnce({ created: 0, data: [{}] }) + + await expect( + adapter.generateImages({ + model: 'gpt-image-1', + prompt: 'A cat', + logger: testLogger, + }), + ).rejects.toThrow(/image response contained no images/) + }) + it('generates a unique ID for each response', async () => { const mockResponse: OpenAI.Images.ImagesResponse = { created: 0, From 4b60a05f99e437a9918281dc1977c7645dd43074 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Sun, 7 Jun 2026 18:09:52 +1000 Subject: [PATCH 09/11] feat: client-side multimodal prompts, e2e coverage, media example, fal field demotion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ai-client: widen ImageGenerateInput.prompt / VideoGenerateInput.prompt from string to MediaPrompt so useGenerateImage/useGenerateVideo can carry image parts from the browser; re-export the MediaPrompt types from @tanstack/ai/client - ai-fal: demote media-conditioning fields (FalImageFieldName set plus video_url/video_urls/reference_video_urls/audio_url) from required to optional in FalImageProviderOptions / FalVideoProviderOptions — i2v 
endpoints declare e.g. image_url as required, but with a multimodal prompt the start frame arrives as a prompt part; modelOptions stays available as the explicit escape hatch - e2e: real coverage for image-to-image (OpenAI /v1/images/edits) and image-to-video (Sora multipart /v1/videos with input_reference) — the installed aimock 1.29 mocks both multipart endpoints, so the previous "aimock can't mock this" empty provider sets were stale. New specs run all three transports and assert via aimock's request journal that the expected wire endpoint was hit. ImageGenUI/VideoGenUI gain a file input, feature routing/fixtures/onVideo registration added, README matrix updated - examples/ts-react-media: ImageGenerator gains a multi-image reference picker (Gemini native models); VideoGenerator sends the start frame as a prompt part with role 'start_frame' instead of modelOptions URLs; server functions narrow the wire prompt per model and throw on unsupported part kinds instead of dropping them Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/image-and-video-inputs.md | 5 +- docs/media/video-generation.md | 2 +- .../src/components/ImageGenerator.tsx | 86 +++++++++++- .../src/components/VideoGenerator.tsx | 27 ++-- examples/ts-react-media/src/lib/media.ts | 78 +++++++++++ .../src/lib/server-functions.ts | 130 ++++++++++++------ packages/ai-client/src/generation-types.ts | 18 ++- .../tests/video-generation-client.test.ts | 3 +- packages/ai-fal/src/model-meta.ts | 38 ++++- packages/ai-fal/tests/video-adapter.test.ts | 8 ++ packages/ai/src/client.ts | 4 + testing/e2e/README.md | 2 + .../e2e/fixtures/image-to-image/basic.json | 14 ++ testing/e2e/global-setup.ts | 13 ++ testing/e2e/src/components/ImageGenUI.tsx | 52 ++++++- testing/e2e/src/components/VideoGenUI.tsx | 51 ++++++- testing/e2e/src/lib/feature-support.ts | 26 ++-- testing/e2e/src/lib/features.ts | 8 ++ testing/e2e/src/lib/server-functions.ts | 17 ++- testing/e2e/src/routes/$provider/$feature.tsx | 22 +++ 
testing/e2e/src/routes/api.image.stream.ts | 3 +- testing/e2e/src/routes/api.image.ts | 3 +- testing/e2e/src/routes/api.video.stream.ts | 3 +- testing/e2e/src/routes/api.video.ts | 3 +- testing/e2e/tests/image-to-image.spec.ts | 105 ++++++++++++++ testing/e2e/tests/image-to-video.spec.ts | 103 ++++++++++++++ 26 files changed, 731 insertions(+), 93 deletions(-) create mode 100644 examples/ts-react-media/src/lib/media.ts create mode 100644 testing/e2e/fixtures/image-to-image/basic.json create mode 100644 testing/e2e/tests/image-to-image.spec.ts create mode 100644 testing/e2e/tests/image-to-video.spec.ts diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md index d1d5d51ec..e6a324899 100644 --- a/.changeset/image-and-video-inputs.md +++ b/.changeset/image-and-video-inputs.md @@ -5,6 +5,7 @@ '@tanstack/ai-fal': minor '@tanstack/ai-grok': minor '@tanstack/ai-openrouter': minor +'@tanstack/ai-client': minor '@tanstack/ai-event-client': patch --- @@ -17,11 +18,13 @@ Provider behavior in this release: - **OpenAI image** — Prompts with image parts route `gpt-image-2` / `gpt-image-1` / `gpt-image-1-mini` to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` rejects image parts at compile time and at runtime. - **OpenAI video** — Sora-2 / Sora-2-Pro accept a single image part as `input_reference`; passing more than one throws. - **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") map prompt parts 1:1 onto multimodal `contents`, preserving interleaved order. Imagen is text-only (compile-time + runtime rejection). -- **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). 
Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Per-model prompt modalities are derived at the type level from the SDK's endpoint input types. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). +- **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Per-model prompt modalities are derived at the type level from the SDK's endpoint input types. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). In `FalImageProviderOptions` / `FalVideoProviderOptions`, media-conditioning fields the mappers can populate (`image_url`, `start_image_url`, `video_url`, `audio_url`, …) are demoted from required to optional — supply them as prompt parts, or keep passing them explicitly via `modelOptions`. - **Grok** — New `grok-imagine-image` / `grok-imagine-image-quality` models. Prompts with image parts route to xAI's JSON `/v1/images/edits` endpoint (up to 3 source images, addressed by xAI in request order; the prompt is sent verbatim). `role: 'mask'` / `'control'` throw. 
Their `size` uses an `aspectRatio_resolution` template (`'16:9_2k'`, suffix optional) mirroring Gemini's native image models. `grok-2-image-1212` remains text-to-image only. - **OpenRouter** — Prompt parts map 1:1 onto multimodal `text` / `image_url` chat content parts, preserving interleaved order, and are forwarded to the underlying image model. URL sources pass through verbatim (no fetching or re-encoding in your process); `data` sources become data URIs. - **Anthropic** — Unchanged (no image generation API). A new `resolveMediaPrompt()` utility (exported from `@tanstack/ai`) is the single downrev point from the canonical interleaved prompt shape to flattened text + per-modality part buckets, for adapter authors. +On the client side, `ImageGenerateInput.prompt` and `VideoGenerateInput.prompt` (`@tanstack/ai-client`, and the `useGenerateImage` / `useGenerateVideo` hooks built on them) are widened from `string` to the same `MediaPrompt` shape, so prompt parts can be sent from the browser through your server route to `generateImage()` / `generateVideo()`. + Closes #618. diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index 513a3014f..dd175b3b6 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -444,7 +444,7 @@ await generateVideo({ | Provider | Image-to-Video Behavior | | ------------ | -------------------------------------------------------------------------------------------------------- | | **OpenAI** | Sora-2 / Sora-2-Pro → the image part goes to `input_reference`; flattened text is the prompt. Single image only — throws if more than one. | -| **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types — e.g. `role: 'start_frame'` lands on `image_url` for Kling/Veo image-to-video, `first_frame_url` for first-last-frame endpoints, and `start_image_url` otherwise. 
Defaults: single input → `image_url` (start frame); `role: 'end_frame'` → `end_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions`. | +| **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types — e.g. `role: 'start_frame'` lands on `image_url` for Kling/Veo image-to-video, `first_frame_url` for first-last-frame endpoints, and `start_image_url` otherwise. Defaults: single input → `image_url` (start frame); `role: 'end_frame'` → `end_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions` — the media-conditioning fields are typed optional there (even when the endpoint requires them) since they usually arrive as prompt parts. | | **Gemini** | Veo adapter not yet implemented — image prompt parts will be supported when Veo lands. | Adapters whose underlying API can't accept image inputs throw a clear diff --git a/examples/ts-react-media/src/components/ImageGenerator.tsx b/examples/ts-react-media/src/components/ImageGenerator.tsx index 3a7c5b0e2..be606dcf2 100644 --- a/examples/ts-react-media/src/components/ImageGenerator.tsx +++ b/examples/ts-react-media/src/components/ImageGenerator.tsx @@ -1,10 +1,13 @@ -import { useState } from 'react' -import { ImageIcon, Loader2, Shuffle } from 'lucide-react' +import { useRef, useState } from 'react' +import { ImageIcon, Loader2, Plus, Shuffle, X } from 'lucide-react' import type { ImageGenerationResult } from '@tanstack/ai' +import type { MediaPrompt } from '@tanstack/ai/client' import { generateImageFn } from '@/lib/server-functions' import { getRandomImagePrompt } from '@/lib/prompts' import { IMAGE_MODELS } from '@/lib/models' +import { readImageFile, toImagePart } from '@/lib/media' +import type { AttachedImage } from '@/lib/media' interface ImageGeneratorProps { onImageGenerated?: (imageUrl: string) => void @@ -32,11 +35,37 @@ export default function 
ImageGenerator({ const [selectedModel, setSelectedModel] = useState('all') const [isLoading, setIsLoading] = useState(false) const [results, setResults] = useState>({}) + const [images, setImages] = useState>([]) + const fileInputRef = useRef(null) const currentModel = IMAGE_MODELS.find((m) => m.id === selectedModel) + // When images are attached, send an ordered parts array (text first, then one + // image part per attachment). Otherwise send the plain string. Only image-capable + // models accept image inputs — unsupported models surface a server error. + const buildPrompt = (): MediaPrompt => { + if (images.length === 0) return prompt + return [ + { type: 'text', content: prompt }, + ...images.map((image) => toImagePart(image)), + ] + } + + const handleImageSelect = async (e: React.ChangeEvent) => { + const files = Array.from(e.target.files ?? []) + if (fileInputRef.current) fileInputRef.current.value = '' + if (files.length === 0) return + const attached = await Promise.all(files.map((file) => readImageFile(file))) + setImages((prev) => [...prev, ...attached]) + } + + const removeImage = (id: string) => { + setImages((prev) => prev.filter((image) => image.id !== id)) + } + const handleGenerate = async () => { if (!prompt.trim()) return + const builtPrompt = buildPrompt() setIsLoading(true) setResults({}) @@ -53,7 +82,7 @@ export default function ImageGenerator({ const promises = IMAGE_MODELS.map(async (model) => { try { const response = await generateImageFn({ - data: { prompt, model: model.id }, + data: { prompt: builtPrompt, model: model.id }, }) setResults((prev) => ({ ...prev, @@ -83,7 +112,7 @@ export default function ImageGenerator({ try { const response = await generateImageFn({ - data: { prompt, model: selectedModel }, + data: { prompt: builtPrompt, model: selectedModel }, }) setResults({ [selectedModel]: { status: 'success', result: response } }) const image = response.images[0] @@ -162,6 +191,55 @@ export default function ImageGenerator({ /> +
+
+ + + Supported by Gemini native (NanoBanana) models only + +
+
+ {images.map((image) => ( +
+ {image.name} + +
+ ))} + +
+ +
+ + {withImageInput && ( + setImageFile(e.target.files?.[0] ?? null)} + className="text-sm text-gray-400" + /> + )}
{status === 'idle' ? 'idle' diff --git a/testing/e2e/src/components/VideoGenUI.tsx b/testing/e2e/src/components/VideoGenUI.tsx index be5be8923..85f94ee83 100644 --- a/testing/e2e/src/components/VideoGenUI.tsx +++ b/testing/e2e/src/components/VideoGenUI.tsx @@ -5,6 +5,7 @@ import { fetchHttpStream, } from '@tanstack/ai-react' import { generateVideoFn } from '@/lib/server-functions' +import type { MediaPrompt } from '@tanstack/ai' import type { Mode, Provider } from '@/lib/types' import type { VideoGenerateResult } from '@tanstack/ai-client' @@ -13,6 +14,24 @@ interface VideoGenUIProps { mode: Mode testId?: string aimockPort?: number + /** Show a file input and send the prompt as multimodal parts (image-to-video). */ + withImageInput?: boolean +} + +function fileToBase64(file: File): Promise { + return new Promise((resolve, reject) => { + const reader = new FileReader() + reader.onload = () => { + const result = reader.result + if (typeof result !== 'string') { + reject(new Error('Unexpected FileReader result')) + return + } + resolve(result.split(',')[1] ?? 
'') + } + reader.onerror = () => reject(new Error('Failed to read file')) + reader.readAsDataURL(file) + }) } export function VideoGenUI({ @@ -20,8 +39,10 @@ export function VideoGenUI({ mode, testId, aimockPort, + withImageInput, }: VideoGenUIProps) { const [prompt, setPrompt] = useState('') + const [imageFile, setImageFile] = useState(null) const connectionOptions = () => { const body = { provider, testId, aimockPort } @@ -33,7 +54,7 @@ export function VideoGenUI({ return { connection: fetchHttpStream('/api/video/stream'), body } } return { - fetcher: async (input: { prompt: string }) => { + fetcher: async (input: { prompt: MediaPrompt }) => { return generateVideoFn({ data: { prompt: input.prompt, provider, aimockPort, testId }, }) as Promise @@ -44,6 +65,23 @@ export function VideoGenUI({ const { generate, result, videoStatus, isLoading, error, status } = useGenerateVideo(connectionOptions()) + const handleGenerate = async () => { + if (!imageFile) { + await generate({ prompt }) + return + } + const base64 = await fileToBase64(imageFile) + await generate({ + prompt: [ + { type: 'text', content: prompt }, + { + type: 'image', + source: { type: 'data', value: base64, mimeType: imageFile.type }, + }, + ], + }) + } + return (
@@ -57,13 +95,22 @@ export function VideoGenUI({ />
+ {withImageInput && ( + setImageFile(e.target.files?.[0] ?? null)} + className="text-sm text-gray-400" + /> + )}
{status === 'idle' ? 'idle' diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index 54ca2990f..b4e85a715 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -178,22 +178,26 @@ export const matrix: Record> = { ]), // Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format 'image-gen': new Set(['openai', 'grok']), - // image-to-image (image parts in the generateImage prompt) routes adapters - // to wire endpoints aimock doesn't yet mock (OpenAI `/v1/images/edits`, - // Gemini multimodal `generateContent`, xAI `/v1/images/edits`, OpenRouter - // multimodal chat content parts, fal endpoint-specific input fields). - // Adapter-level mapping is covered by unit tests. Populate this set when - // aimock gains support for those endpoints. - 'image-to-image': new Set([]), + // image-to-image (image parts in the generateImage prompt). aimock 1.29 + // mocks OpenAI's multipart `/v1/images/edits` (matches on the `prompt` form + // field, ignores the binary image/mask fields), so the OpenAI route runs + // end-to-end. Other providers route to endpoints aimock doesn't mock yet + // (Gemini multimodal `generateContent`, xAI's JSON `/v1/images/edits`, + // OpenRouter multimodal chat content parts, fal endpoint-specific input + // fields) — their mapping is covered by unit tests. Add them here when + // aimock support lands. + 'image-to-image': new Set(['openai']), 'audio-gen': new Set(['gemini', 'elevenlabs']), 'sound-effects': new Set(['elevenlabs']), tts: new Set(['openai', 'grok', 'elevenlabs']), transcription: new Set(['openai', 'grok', 'elevenlabs']), 'video-gen': new Set(['openai']), - // image-to-video (image parts in the generateVideo prompt) similarly - // depends on aimock mocking Sora's `input_reference` upload field. - // Populate when aimock support lands. - 'image-to-video': new Set([]), + // image-to-video (image parts in the generateVideo prompt). 
aimock 1.29's + // `/v1/videos` handler parses Sora's multipart upload (the SDK switches to + // multipart when `input_reference` carries a File) and matches on the + // `prompt` form field, so the OpenAI/Sora route runs end-to-end. fal's + // endpoint-specific fields remain unit-test-only. + 'image-to-video': new Set(['openai']), // Only Gemini currently surfaces a first-class stateful conversation API via // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental). 'stateful-interactions': new Set(['gemini']), diff --git a/testing/e2e/src/lib/features.ts b/testing/e2e/src/lib/features.ts index 446859ce6..032720468 100644 --- a/testing/e2e/src/lib/features.ts +++ b/testing/e2e/src/lib/features.ts @@ -103,6 +103,10 @@ export const featureConfigs: Record = { tools: [], modelOptions: {}, }, + 'image-to-image': { + tools: [], + modelOptions: {}, + }, 'audio-gen': { tools: [], modelOptions: {}, @@ -123,6 +127,10 @@ export const featureConfigs: Record = { tools: [], modelOptions: {}, }, + 'image-to-video': { + tools: [], + modelOptions: {}, + }, 'stateful-interactions': { tools: [], modelOptions: {}, diff --git a/testing/e2e/src/lib/server-functions.ts b/testing/e2e/src/lib/server-functions.ts index 03132c193..20faeb7b4 100644 --- a/testing/e2e/src/lib/server-functions.ts +++ b/testing/e2e/src/lib/server-functions.ts @@ -7,6 +7,7 @@ import { generateVideo, getVideoJobStatus, } from '@tanstack/ai' +import type { MediaPrompt } from '@tanstack/ai' import type { Feature, Provider } from '@/lib/types' import { createAudioAdapter, @@ -19,13 +20,17 @@ import { export const generateImageFn = createServerFn({ method: 'POST' }) .inputValidator( (data: { - prompt: string + prompt: MediaPrompt provider: Provider numberOfImages?: number aimockPort?: number testId?: string }) => { - if (!data.prompt.trim()) throw new Error('Prompt is required') + const isEmpty = + typeof data.prompt === 'string' + ? 
!data.prompt.trim() + : data.prompt.length === 0 + if (isEmpty) throw new Error('Prompt is required') if (!data.provider) throw new Error('Provider is required') return data }, @@ -133,12 +138,16 @@ export const generateAudioFn = createServerFn({ method: 'POST' }) export const generateVideoFn = createServerFn({ method: 'POST' }) .inputValidator( (data: { - prompt: string + prompt: MediaPrompt provider: Provider aimockPort?: number testId?: string }) => { - if (!data.prompt.trim()) throw new Error('Prompt is required') + const isEmpty = + typeof data.prompt === 'string' + ? !data.prompt.trim() + : data.prompt.length === 0 + if (isEmpty) throw new Error('Prompt is required') if (!data.provider) throw new Error('Provider is required') return data }, diff --git a/testing/e2e/src/routes/$provider/$feature.tsx b/testing/e2e/src/routes/$provider/$feature.tsx index ea080c4fc..b1fe5b40f 100644 --- a/testing/e2e/src/routes/$provider/$feature.tsx +++ b/testing/e2e/src/routes/$provider/$feature.tsx @@ -42,9 +42,11 @@ export const Route = createFileRoute('/$provider/$feature')({ const MEDIA_FEATURES = new Set([ 'image-gen', + 'image-to-image', 'tts', 'transcription', 'video-gen', + 'image-to-video', 'audio-gen', 'sound-effects', ]) @@ -132,6 +134,16 @@ function MediaFeature({ aimockPort={aimockPort} /> ) + case 'image-to-image': + return ( + + ) case 'tts': return ( ) + case 'image-to-video': + return ( + + ) case 'audio-gen': case 'sound-effects': return ( diff --git a/testing/e2e/src/routes/api.image.stream.ts b/testing/e2e/src/routes/api.image.stream.ts index abcf2c280..bd65b9756 100644 --- a/testing/e2e/src/routes/api.image.stream.ts +++ b/testing/e2e/src/routes/api.image.stream.ts @@ -1,6 +1,7 @@ import { createFileRoute } from '@tanstack/react-router' import { generateImage, toHttpResponse } from '@tanstack/ai' import { createImageAdapter } from '@/lib/media-providers' +import type { MediaPrompt } from '@tanstack/ai' import type { Provider } from '@/lib/types' export 
const Route = createFileRoute('/api/image/stream')({ @@ -13,7 +14,7 @@ export const Route = createFileRoute('/api/image/stream')({ const data = body.forwardedProps ?? body.data ?? body const { prompt, provider, numberOfImages, testId, aimockPort } = data as { - prompt: string + prompt: MediaPrompt provider: Provider numberOfImages?: number testId?: string diff --git a/testing/e2e/src/routes/api.image.ts b/testing/e2e/src/routes/api.image.ts index 8fb9829ac..d8b455a63 100644 --- a/testing/e2e/src/routes/api.image.ts +++ b/testing/e2e/src/routes/api.image.ts @@ -1,6 +1,7 @@ import { createFileRoute } from '@tanstack/react-router' import { generateImage, toServerSentEventsResponse } from '@tanstack/ai' import { createImageAdapter } from '@/lib/media-providers' +import type { MediaPrompt } from '@tanstack/ai' import type { Provider } from '@/lib/types' export const Route = createFileRoute('/api/image')({ @@ -13,7 +14,7 @@ export const Route = createFileRoute('/api/image')({ const data = body.forwardedProps ?? body.data ?? body const { prompt, provider, numberOfImages, testId, aimockPort } = data as { - prompt: string + prompt: MediaPrompt provider: Provider numberOfImages?: number testId?: string diff --git a/testing/e2e/src/routes/api.video.stream.ts b/testing/e2e/src/routes/api.video.stream.ts index 33643bd02..88eb1a189 100644 --- a/testing/e2e/src/routes/api.video.stream.ts +++ b/testing/e2e/src/routes/api.video.stream.ts @@ -1,6 +1,7 @@ import { createFileRoute } from '@tanstack/react-router' import { generateVideo, toHttpResponse } from '@tanstack/ai' import { createVideoAdapter } from '@/lib/media-providers' +import type { MediaPrompt } from '@tanstack/ai' import type { Provider } from '@/lib/types' export const Route = createFileRoute('/api/video/stream')({ @@ -12,7 +13,7 @@ export const Route = createFileRoute('/api/video/stream')({ const body = await request.json() const data = body.forwardedProps ?? body.data ?? 
body const { prompt, provider, testId, aimockPort } = data as { - prompt: string + prompt: MediaPrompt provider: Provider testId?: string aimockPort?: number diff --git a/testing/e2e/src/routes/api.video.ts b/testing/e2e/src/routes/api.video.ts index e50d9cb87..a9b0903ec 100644 --- a/testing/e2e/src/routes/api.video.ts +++ b/testing/e2e/src/routes/api.video.ts @@ -1,6 +1,7 @@ import { createFileRoute } from '@tanstack/react-router' import { generateVideo, toServerSentEventsResponse } from '@tanstack/ai' import { createVideoAdapter } from '@/lib/media-providers' +import type { MediaPrompt } from '@tanstack/ai' import type { Provider } from '@/lib/types' export const Route = createFileRoute('/api/video')({ @@ -12,7 +13,7 @@ export const Route = createFileRoute('/api/video')({ const body = await request.json() const data = body.forwardedProps ?? body.data ?? body const { prompt, provider, testId, aimockPort } = data as { - prompt: string + prompt: MediaPrompt provider: Provider testId?: string aimockPort?: number diff --git a/testing/e2e/tests/image-to-image.spec.ts b/testing/e2e/tests/image-to-image.spec.ts new file mode 100644 index 000000000..e4608d6af --- /dev/null +++ b/testing/e2e/tests/image-to-image.spec.ts @@ -0,0 +1,105 @@ +import path from 'path' +import { fileURLToPath } from 'url' +import { test, expect } from './fixtures' +import { + fillPrompt, + clickGenerate, + waitForGenerationComplete, + featureUrl, +} from './helpers' +import { providersFor } from './test-matrix' + +const __filename = fileURLToPath(import.meta.url) +const __dirname = path.dirname(__filename) +const testImagePath = path.resolve(__dirname, '../test-assets/guitar-shop.png') + +// Image-conditioned generation: the prompt is sent as multimodal parts +// (text + attached image). For OpenAI this routes generateImage() to the +// multipart /v1/images/edits endpoint instead of /v1/images/generations, +// exercising the imagePartToFile upload path end-to-end. 
+for (const provider of providersFor('image-to-image')) { + test.describe(`${provider} -- image-to-image`, () => { + test('sse -- edits an image via SSE connection', async ({ + page, + request, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl(provider, 'image-to-image', testId, aimockPort, 'sse'), + ) + // Cold vite compiles of this route can delay hydration past fillPrompt's + // fallback; wait for the page to settle before interacting. + await page.waitForLoadState('networkidle') + await fillPrompt(page, 'add a tree to this product photo') + await page.getByTestId('image-input').setInputFiles(testImagePath) + await clickGenerate(page) + await waitForGenerationComplete(page) + const images = page.getByTestId('generated-image') + await expect(images).toHaveCount(1) + + // The fixture matches on prompt text regardless of endpoint, so also + // prove the adapter routed to the multipart edits endpoint (and didn't + // silently drop the image part and call /v1/images/generations). + const journalRes = await request.get( + `http://127.0.0.1:${aimockPort}/v1/_requests`, + ) + const entries = (await journalRes.json()) as Array<{ + path?: string + body?: unknown + }> + const editEntry = entries.find( + (e) => + e.path === '/v1/images/edits' && + JSON.stringify(e.body ?? '').includes( + 'add a tree to this product photo', + ), + ) + expect(editEntry).toBeTruthy() + }) + + test('http-stream -- edits an image via HTTP stream', async ({ + page, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl( + provider, + 'image-to-image', + testId, + aimockPort, + 'http-stream', + ), + ) + // Cold vite compiles of this route can delay hydration past fillPrompt's + // fallback; wait for the page to settle before interacting. 
+ await page.waitForLoadState('networkidle') + await fillPrompt(page, 'add a tree to this product photo') + await page.getByTestId('image-input').setInputFiles(testImagePath) + await clickGenerate(page) + await waitForGenerationComplete(page) + const images = page.getByTestId('generated-image') + await expect(images).toHaveCount(1) + }) + + test('fetcher -- edits an image via server function', async ({ + page, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl(provider, 'image-to-image', testId, aimockPort, 'fetcher'), + ) + // Cold vite compiles of this route can delay hydration past fillPrompt's + // fallback; wait for the page to settle before interacting. + await page.waitForLoadState('networkidle') + await fillPrompt(page, 'add a tree to this product photo') + await page.getByTestId('image-input').setInputFiles(testImagePath) + await clickGenerate(page) + await waitForGenerationComplete(page) + const images = page.getByTestId('generated-image') + await expect(images).toHaveCount(1) + }) + }) +} diff --git a/testing/e2e/tests/image-to-video.spec.ts b/testing/e2e/tests/image-to-video.spec.ts new file mode 100644 index 000000000..951067953 --- /dev/null +++ b/testing/e2e/tests/image-to-video.spec.ts @@ -0,0 +1,103 @@ +import path from 'path' +import { fileURLToPath } from 'url' +import { test, expect } from './fixtures' +import { + fillPrompt, + clickGenerate, + waitForGenerationComplete, + featureUrl, +} from './helpers' +import { providersFor } from './test-matrix' + +const __filename = fileURLToPath(import.meta.url) +const __dirname = path.dirname(__filename) +const testImagePath = path.resolve(__dirname, '../test-assets/guitar-shop.png') + +// Image-to-video: the prompt is sent as multimodal parts (text + attached +// image). For OpenAI/Sora the image part is uploaded as `input_reference`, +// which switches the SDK to a multipart POST /v1/videos — exercising the +// imagePartToFile conversion and job polling flow end-to-end. 
+for (const provider of providersFor('image-to-video')) { + test.describe(`${provider} -- image-to-video`, () => { + test('sse -- animates an image via SSE connection', async ({ + page, + request, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl(provider, 'image-to-video', testId, aimockPort, 'sse'), + ) + // Cold vite compiles of this route can delay hydration past fillPrompt's + // fallback; wait for the page to settle before interacting. + await page.waitForLoadState('networkidle') + await fillPrompt(page, 'animate this product photo') + await page.getByTestId('image-input').setInputFiles(testImagePath) + await clickGenerate(page) + await waitForGenerationComplete(page, 60_000) + const video = page.getByTestId('generated-video') + await expect(video).toBeVisible() + + // Prove the multipart POST /v1/videos round-tripped with the prompt + // text intact — the SDK switches to multipart when `input_reference` + // carries a File, and aimock extracts `prompt` from the form data. + const journalRes = await request.get( + `http://127.0.0.1:${aimockPort}/v1/_requests`, + ) + const entries = (await journalRes.json()) as Array<{ + path?: string + body?: unknown + }> + const videoEntry = entries.find( + (e) => + e.path === '/v1/videos' && + JSON.stringify(e.body ?? '').includes('animate this product photo'), + ) + expect(videoEntry).toBeTruthy() + }) + + test('http-stream -- animates an image via HTTP stream', async ({ + page, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl( + provider, + 'image-to-video', + testId, + aimockPort, + 'http-stream', + ), + ) + // Cold vite compiles of this route can delay hydration past fillPrompt's + // fallback; wait for the page to settle before interacting. 
+ await page.waitForLoadState('networkidle') + await fillPrompt(page, 'animate this product photo') + await page.getByTestId('image-input').setInputFiles(testImagePath) + await clickGenerate(page) + await waitForGenerationComplete(page, 60_000) + const video = page.getByTestId('generated-video') + await expect(video).toBeVisible() + }) + + test('fetcher -- animates an image via server function', async ({ + page, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl(provider, 'image-to-video', testId, aimockPort, 'fetcher'), + ) + // Cold vite compiles of this route can delay hydration past fillPrompt's + // fallback; wait for the page to settle before interacting. + await page.waitForLoadState('networkidle') + await fillPrompt(page, 'animate this product photo') + await page.getByTestId('image-input').setInputFiles(testImagePath) + await clickGenerate(page) + await waitForGenerationComplete(page, 60_000) + const video = page.getByTestId('generated-video') + await expect(video).toBeVisible() + }) + }) +} From acd73197fc99d04fecda0cae39133f465a4ce75e Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Wed, 10 Jun 2026 21:28:59 +1000 Subject: [PATCH 10/11] fix: address CodeRabbit review findings - fal image/video: spread modelOptions after derived media fields so explicit user overrides win (matches documented intent) - openai video: validate effective size (size ?? 
modelOptions.size) - generate-fal-image-field-map: run arity check for default-selected fields too - ts-react-media example: correct reference-image support comment (Gemini multimodal models, not NanoBanana) - e2e VideoGenUI: reject on malformed data URL instead of resolving '' Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ts-react-media/src/components/ImageGenerator.tsx | 3 ++- packages/ai-fal/src/adapters/image.ts | 9 +++++---- packages/ai-fal/src/adapters/video.ts | 4 +++- packages/ai-openai/src/adapters/video.ts | 9 ++++----- scripts/generate-fal-image-field-map.ts | 7 +++++-- testing/e2e/src/components/VideoGenUI.tsx | 7 ++++++- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/examples/ts-react-media/src/components/ImageGenerator.tsx b/examples/ts-react-media/src/components/ImageGenerator.tsx index be606dcf2..ca72e3823 100644 --- a/examples/ts-react-media/src/components/ImageGenerator.tsx +++ b/examples/ts-react-media/src/components/ImageGenerator.tsx @@ -197,7 +197,8 @@ export default function ImageGenerator({ Reference Images - Supported by Gemini native (NanoBanana) models only + Supported by Gemini multimodal models only + (gemini-3.1-flash-image-preview, gemini-3-pro-image-preview)
diff --git a/packages/ai-fal/src/adapters/image.ts b/packages/ai-fal/src/adapters/image.ts index b4327d40f..4595e4db3 100644 --- a/packages/ai-fal/src/adapters/image.ts +++ b/packages/ai-fal/src/adapters/image.ts @@ -107,14 +107,15 @@ export class FalImageAdapter extends BaseImageAdapter< resolved: ResolvedMediaPrompt, ): FalModelInput { const sizeParams = mapSizeToFalFormat(options.size) - // Order matters: modelOptions first (so user overrides win for - // mask_url / control_image_url / reference_image_urls), then size, - // then derived image-input fields, then prompt / num_images. + // Order matters: size and derived image-input fields first, then + // modelOptions (so explicit user overrides win for mask_url / + // control_image_url / reference_image_urls), then the call-controlled + // prompt / num_images, which always take precedence. const inputFields = mapImageInputsToFalFields(this.model, resolved.images) const input = { - ...options.modelOptions, ...sizeParams, ...inputFields, + ...options.modelOptions, // Media-only prompts (e.g. upscalers, background removal) omit the // prompt field entirely rather than sending an empty string. ...(resolved.text ? { prompt: resolved.text } : {}), diff --git a/packages/ai-fal/src/adapters/video.ts b/packages/ai-fal/src/adapters/video.ts index 7d8e424be..5ea056f82 100644 --- a/packages/ai-fal/src/adapters/video.ts +++ b/packages/ai-fal/src/adapters/video.ts @@ -167,11 +167,13 @@ export class FalVideoAdapter extends BaseVideoAdapter< const audioFields = mapAudioInputsToFalFields(resolved.audios) const input = { - ...modelOptions, ...sizeParams, ...inputImageFields, ...videoFields, ...audioFields, + // modelOptions applied after derived media fields so explicit user + // overrides (video_url, reference_video_urls, audio_url, ...) win. + ...modelOptions, // Media-only prompts omit the prompt field rather than sending an // empty string (e.g. pure image-to-video endpoints). ...(resolved.text ? 
{ prompt: resolved.text } : {}), diff --git a/packages/ai-openai/src/adapters/video.ts b/packages/ai-openai/src/adapters/video.ts index 15bb2a733..e28a34be5 100644 --- a/packages/ai-openai/src/adapters/video.ts +++ b/packages/ai-openai/src/adapters/video.ts @@ -92,7 +92,8 @@ export class OpenAIVideoAdapter< ): Promise { const { model, size, duration, modelOptions } = options - validateVideoSize(model, size) + const resolvedSize = size ?? modelOptions?.size + validateVideoSize(model, resolvedSize) const seconds = duration ?? modelOptions?.seconds validateVideoSeconds(model, seconds) @@ -129,10 +130,8 @@ export class OpenAIVideoAdapter< } // `VideoCreateParams.size` is `size?: VideoSize` (no `| undefined`), so we // narrow before assignment instead of casting from a `T | undefined` source. - if (size) { - request.size = size - } else if (modelOptions?.size) { - request.size = modelOptions.size + if (resolvedSize) { + request.size = resolvedSize } if (seconds !== undefined) { // `toApiSeconds` returns `OpenAIVideoSeconds | undefined`; we already diff --git a/scripts/generate-fal-image-field-map.ts b/scripts/generate-fal-image-field-map.ts index 297819fe8..2743d1eba 100644 --- a/scripts/generate-fal-image-field-map.ts +++ b/scripts/generate-fal-image-field-map.ts @@ -221,10 +221,12 @@ function computeOverrides( for (const role of ROLE_ORDER) { if (VIDEO_ONLY_ROLES.has(role) && !producesVideo) continue const chosen = CANDIDATES[role].find((candidate) => fields.has(candidate)) - if (!chosen || chosen === DEFAULTS[role]) continue + if (!chosen) continue // Arity sanity check: the runtime mapper decides array-wrapping from - // the static LIST_FIELDS set, so the actual type must agree. + // the static LIST_FIELDS set, so the actual type must agree. Run this + // for default-selected fields too, otherwise a default field's type + // could drift silently. const actualIsList = isList.get(chosen) ?? 
false const assumedIsList = LIST_FIELDS.has(chosen) if (actualIsList !== assumedIsList) { @@ -235,6 +237,7 @@ function computeOverrides( `and LIST_FIELDS in image-inputs.ts.`, ) } + if (chosen === DEFAULTS[role]) continue entry[role] = chosen } if (Object.keys(entry).length > 0) overrides.set(endpointId, entry) diff --git a/testing/e2e/src/components/VideoGenUI.tsx b/testing/e2e/src/components/VideoGenUI.tsx index 85f94ee83..1b068d741 100644 --- a/testing/e2e/src/components/VideoGenUI.tsx +++ b/testing/e2e/src/components/VideoGenUI.tsx @@ -27,7 +27,12 @@ function fileToBase64(file: File): Promise { reject(new Error('Unexpected FileReader result')) return } - resolve(result.split(',')[1] ?? '') + const base64 = result.split(',')[1] + if (!base64) { + reject(new Error(`Unexpected data URL format: ${result.slice(0, 32)}…`)) + return + } + resolve(base64) } reader.onerror = () => reject(new Error('Failed to read file')) reader.readAsDataURL(file) From 1b5253394836619a763673fb53fba1db82beed81 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:03:54 +1000 Subject: [PATCH 11/11] feat(ai,ai-gemini): add Google Veo video adapter on the typed-duration contract (#634) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restacked on 618-image-to-image-and-image-to-video-support to adopt the multimodal MediaPrompt format, carrying a minimal additive port of the #534 typed-duration contract: - @tanstack/ai (non-breaking): VideoAdapter/BaseVideoAdapter gain a TModelDurationByName generic (default Record preserves existing duration?: number typing), DurationOptions, snapToDurationOption, and default availableDurations()/snapDuration() implementations. generateVideo's duration is typed via VideoDurationForAdapter. 
- @tanstack/ai-gemini: GeminiVideoAdapter over generateVideos / getVideosOperation with per-model typed durations (Veo 3.x 4|6|8, Veo 2 5|6|8 per current Veo docs), MediaPrompt image routing (start_frame → image, end_frame → lastFrame, reference/character → referenceImages), RAI filter surfacing, geminiVideo/createGeminiVideo factories, and finalized Veo model-meta entries. - E2E: gemini added to video-gen with a custom aimock mount for :predictLongRunning + operations polling; all transports pass. - Docs + media-generation skill updated for Veo (typed durations, image-to-video role table). Co-Authored-By: Claude Fable 5 --- .changeset/gemini-veo-video-adapter.md | 42 ++ docs/media/video-generation.md | 80 ++- packages/ai-gemini/src/adapters/video.ts | 411 ++++++++++++++ packages/ai-gemini/src/index.ts | 25 + packages/ai-gemini/src/model-meta.ts | 53 +- .../src/video/video-provider-options.ts | 126 +++++ .../ai-gemini/tests/video-adapter.test.ts | 518 ++++++++++++++++++ .../skills/ai-core/media-generation/SKILL.md | 30 +- .../src/activities/generateVideo/adapter.ts | 74 ++- .../ai/src/activities/generateVideo/index.ts | 28 +- .../ai/src/activities/generateVideo/snap.ts | 100 ++++ packages/ai/src/activities/index.ts | 4 + packages/ai/src/types.ts | 9 +- packages/ai/tests/stream-generation.test.ts | 3 + testing/e2e/global-setup.ts | 68 +++ testing/e2e/src/lib/feature-support.ts | 9 +- testing/e2e/src/lib/media-providers.ts | 10 +- 17 files changed, 1544 insertions(+), 46 deletions(-) create mode 100644 .changeset/gemini-veo-video-adapter.md create mode 100644 packages/ai-gemini/src/adapters/video.ts create mode 100644 packages/ai-gemini/src/video/video-provider-options.ts create mode 100644 packages/ai-gemini/tests/video-adapter.test.ts create mode 100644 packages/ai/src/activities/generateVideo/snap.ts diff --git a/.changeset/gemini-veo-video-adapter.md b/.changeset/gemini-veo-video-adapter.md new file mode 100644 index 000000000..555b90c2c --- /dev/null +++ 
b/.changeset/gemini-veo-video-adapter.md @@ -0,0 +1,42 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-gemini': minor +--- + +Add a Google Veo video adapter (`geminiVideo` / `createGeminiVideo`) and the +per-model typed-duration video contract it is built on (#534, #634). + +**`@tanstack/ai`** (additive, non-breaking): `VideoAdapter` / +`BaseVideoAdapter` gain a `TModelDurationByName` generic (defaulting to +`Record<string, number>`, preserving today's `duration?: number` typing for +adapters without a map) plus two introspection methods with safe defaults: + +- `availableDurations()` — a `DurationOptions` tagged union + (`discrete | range | mixed | none`) describing the durations the current + model accepts. Default: `{ kind: 'none' }`. +- `snapDuration(seconds)` — coerce raw seconds to the closest valid duration + (`snapToDurationOption` is exported for adapter authors). Default: + `undefined`. + +`generateVideo({ duration })` is now typed per model via +`VideoDurationForAdapter`. + +**`@tanstack/ai-gemini`**: new Veo adapter over the long-running +`:predictLongRunning` operation, supporting `veo-3.1-generate-preview`, +`veo-3.1-fast-generate-preview`, `veo-3.0-generate-001`, +`veo-3.0-fast-generate-001`, and `veo-2.0-generate-001`: + +- `geminiVideo('veo-3.0-generate-001')` → `duration?: 4 | 6 | 8` + (Veo 2: `5 | 6 | 8`); `adapter.snapDuration(7)` → `6`. +- Multimodal prompts: the first un-roled / `'start_frame'` image part + becomes the input image, `'end_frame'` → `lastFrame`, `'reference'` / + `'character'` → `referenceImages`. +- `size` takes Veo aspect ratios (`'16:9' | '9:16'`); everything else from + the SDK's `GenerateVideosConfig` (e.g. `resolution`, `generateAudio`, + `negativePrompt`) is available through `modelOptions`. +- Responsible-AI filtering is surfaced as a failed job with the filter + reasons. + +Note: Veo result URLs are served by the Gemini Files API and require the +Google API key to download (`x-goog-api-key` header or `key` query +parameter). 
diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index dd175b3b6..eebbdf530 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -2,11 +2,13 @@ title: Video Generation id: video-generation order: 6 -description: "Generate video from text prompts with OpenAI Sora using TanStack AI's experimental generateVideo() jobs/polling API." +description: "Generate video from text prompts with OpenAI Sora or Google Veo using TanStack AI's experimental generateVideo() jobs/polling API." keywords: - tanstack ai - video generation - sora + - veo + - gemini - generateVideo - jobs api - experimental @@ -36,6 +38,7 @@ TanStack AI provides experimental support for video generation through dedicated Currently supported: - **OpenAI**: Sora-2 and Sora-2-Pro models (when available) +- **Google Gemini**: Veo 3.1, Veo 3, and Veo 2 models (via the long-running operations API) ## Basic Usage @@ -417,9 +420,9 @@ adapter uses to route the input to the provider-specific field: | Role | Maps to | | --------------- | ------------------------------------------------------------- | -| `'start_frame'` | fal `start_image_url` (positional default for the first input) | -| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` planned — no Veo adapter yet) | -| `'reference'` | fal `reference_image_urls` (Veo `referenceImages` planned) | +| `'start_frame'` | fal `start_image_url`, Veo input `image` (positional default for the first input) | +| `'end_frame'` | fal `end_image_url`, Veo `lastFrame` | +| `'reference'` | fal `reference_image_urls`, Veo `referenceImages` | | `'character'` | Same as `'reference'` — character consistency images | ```typescript @@ -445,7 +448,7 @@ await generateVideo({ | ------------ | -------------------------------------------------------------------------------------------------------- | | **OpenAI** | Sora-2 / Sora-2-Pro → the image part goes to `input_reference`; flattened text is the prompt. 
Single image only — throws if more than one. | | **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types — e.g. `role: 'start_frame'` lands on `image_url` for Kling/Veo image-to-video, `first_frame_url` for first-last-frame endpoints, and `start_image_url` otherwise. Defaults: single input → `image_url` (start frame); `role: 'end_frame'` → `end_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions` — the media-conditioning fields are typed optional there (even when the endpoint requires them) since they usually arrive as prompt parts. | -| **Gemini** | Veo adapter not yet implemented — image prompt parts will be supported when Veo lands. | +| **Gemini** | Veo → the first un-roled / `'start_frame'` image becomes the input image; `'end_frame'` → `lastFrame`; `'reference'` / `'character'` → `referenceImages` (asset references, Veo 3.1). Throws on multiple starting images. | Adapters whose underlying API can't accept image inputs throw a clear runtime error so calls fail fast. @@ -488,6 +491,67 @@ const { jobId } = await generateVideo({ }) ``` +### Google Veo (Gemini) Model Options + +Veo runs on Google's long-running operations API. 
The adapter starts the +operation, and `getVideoJobStatus` polls it until the video is ready: + +```typescript +import { generateVideo } from '@tanstack/ai' +import { geminiVideo } from '@tanstack/ai-gemini' + +const adapter = geminiVideo('veo-3.1-generate-preview') + +const { jobId } = await generateVideo({ + adapter, + prompt: 'A close-up of a luthier carving a guitar neck', + size: '16:9', // aspect ratio: '16:9' or '9:16' + duration: 8, // typed per model — see below + modelOptions: { + resolution: '1080p', // '720p' (default), '1080p', '4k' (Veo 3.1 only) + negativePrompt: 'cartoon, low quality', + generateAudio: true, // Veo 3+ generates synchronized audio + }, +}) +``` + +#### Typed durations + +Each Veo model accepts a fixed set of durations, enforced at compile time on +the `duration` option: + +| Model | `duration` values (seconds) | +|-------|------------------------------| +| `veo-3.1-generate-preview` | `4`, `6`, `8` | +| `veo-3.1-fast-generate-preview` | `4`, `6`, `8` | +| `veo-3.0-generate-001` | `4`, `6`, `8` | +| `veo-3.0-fast-generate-001` | `4`, `6`, `8` | +| `veo-2.0-generate-001` | `5`, `6`, `8` | + +If you have raw seconds (for example from a UI slider), coerce them with +`snapDuration`, or inspect the full set with `availableDurations`: + +```typescript +const adapter = geminiVideo('veo-3.0-generate-001') + +adapter.availableDurations() // { kind: 'discrete', values: [4, 6, 8] } +adapter.snapDuration(7) // 6 — closest valid duration + +await generateVideo({ + adapter, + prompt: 'A timelapse of a city skyline at dusk', + duration: adapter.snapDuration(7), +}) +``` + +Adapters that haven't declared a per-model duration map keep the plain +`duration?: number` typing, return `{ kind: 'none' }` from +`availableDurations()`, and return `undefined` from `snapDuration()`. 
+ +> **Note:** The video URL returned for Veo jobs is served by the Gemini +> Files API and requires your API key to download (send it as an +> `x-goog-api-key` header or `key` query parameter). + ## Response Types > **Note:** The interfaces below are the underlying adapter-level types. The `getVideoJobStatus()` helper returns a single merged object, `{ status, progress?, url?, error?, usage? }` — it does not return `jobId` or `expiresAt`. @@ -586,9 +650,11 @@ Check the [OpenAI documentation](https://platform.openai.com/docs) for current l ## Environment Variables -The video adapter uses the same environment variable as other OpenAI adapters: +The video adapters use the same environment variables as the other adapters +for their provider: -- `OPENAI_API_KEY`: Your OpenAI API key +- `OPENAI_API_KEY`: Your OpenAI API key (Sora) +- `GOOGLE_API_KEY` or `GEMINI_API_KEY`: Your Google API key (Veo) ## Explicit API Keys diff --git a/packages/ai-gemini/src/adapters/video.ts b/packages/ai-gemini/src/adapters/video.ts new file mode 100644 index 000000000..b6935e503 --- /dev/null +++ b/packages/ai-gemini/src/adapters/video.ts @@ -0,0 +1,411 @@ +import { + GenerateVideosOperation, + VideoGenerationReferenceType, +} from '@google/genai' +import { resolveMediaPrompt } from '@tanstack/ai' +import { BaseVideoAdapter, snapToDurationOption } from '@tanstack/ai/adapters' +import { arrayBufferToBase64 } from '@tanstack/ai-utils' +import { createGeminiClient, getGeminiApiKeyFromEnv } from '../utils' +import { getGeminiVideoDurationOptions } from '../video/video-provider-options' +import type { DurationOptions } from '@tanstack/ai/adapters' +import type { + ImagePart, + MediaInputMetadata, + VideoGenerationOptions, + VideoJobResult, + VideoStatusResult, + VideoUrlResult, +} from '@tanstack/ai' +import type { + GenerateVideosConfig, + GoogleGenAI, + Image, + VideoGenerationReferenceImage, +} from '@google/genai' +import type { + GeminiVideoModel, + GeminiVideoModelDurationByName, + 
GeminiVideoModelInputModalitiesByName, + GeminiVideoModelProviderOptionsByName, + GeminiVideoModelSizeByName, + GeminiVideoProviderOptions, + GeminiVideoSize, +} from '../video/video-provider-options' +import type { GeminiClientConfig } from '../utils' + +/** + * Configuration for Gemini video adapter. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface GeminiVideoConfig extends GeminiClientConfig {} + +/** + * Extract a human-readable message from a long-running operation's error, + * which the SDK types as `Record` (a google.rpc.Status). + */ +function operationErrorMessage(error: Record): string { + if (typeof error.message === 'string' && error.message.length > 0) { + return error.message + } + return JSON.stringify(error) +} + +/** + * Convert a TanStack image prompt part into the genai `Image` shape Veo + * accepts: base64 `imageBytes` (data sources, data: URIs, fetched HTTP + * URLs) or a `gcsUri` passthrough for Cloud Storage references. + */ +async function imagePartToVeoImage( + part: ImagePart, +): Promise { + if (part.source.type === 'data') { + return { + imageBytes: part.source.value, + mimeType: part.source.mimeType || 'image/png', + } + } + const url = part.source.value + if (url.startsWith('gs://')) { + return { + gcsUri: url, + ...(part.source.mimeType && { mimeType: part.source.mimeType }), + } + } + if (url.startsWith('data:')) { + const match = url.match(/^data:([^;,]+)?(;base64)?,(.*)$/) + if (!match || !match[2]) { + throw new Error( + 'gemini: only base64 data: URIs are supported for video image inputs.', + ) + } + return { + imageBytes: match[3] ?? 
'', + mimeType: match[1] || part.source.mimeType || 'image/png', + } + } + const response = await fetch(url) + if (!response.ok) { + throw new Error( + `Failed to fetch image input (${response.status} ${response.statusText}): ${url}`, + ) + } + const blob = await response.blob() + const buffer = await blob.arrayBuffer() + return { + imageBytes: arrayBufferToBase64(buffer), + mimeType: part.source.mimeType || blob.type || 'image/png', + } +} + +/** + * Gemini Veo Video Generation Adapter + * + * Tree-shakeable adapter for Google Veo video generation. Veo runs as a + * long-running operation: `createVideoJob` starts the operation via the + * `:predictLongRunning` endpoint, `getVideoStatus` polls it, and + * `getVideoUrl` extracts the generated video's URI once it completes. + * + * Image prompt parts are routed by `metadata.role`: + * - `'start_frame'` (or the first un-roled image) → the input image the + * video starts from + * - `'end_frame'` → `lastFrame` (the frame the video ends on) + * - `'reference'` / `'character'` → `referenceImages` (asset references, + * Veo 3.1) + * + * Note: the returned video URI is served by the Gemini Files API and + * requires the API key (`x-goog-api-key` header or `?key=` query + * parameter) to download. + * + * @experimental Video generation is an experimental feature and may change. 
+ */ +export class GeminiVideoAdapter< + TModel extends GeminiVideoModel, +> extends BaseVideoAdapter< + TModel, + GeminiVideoProviderOptions, + GeminiVideoModelProviderOptionsByName, + GeminiVideoModelSizeByName, + GeminiVideoModelInputModalitiesByName, + GeminiVideoModelDurationByName +> { + readonly name = 'gemini' as const + + protected client: GoogleGenAI + + constructor(config: GeminiVideoConfig, model: TModel) { + super({}, model) + this.client = createGeminiClient(config) + } + + async createVideoJob( + options: VideoGenerationOptions< + GeminiVideoProviderOptions, + GeminiVideoSize, + GeminiVideoModelDurationByName[TModel] + >, + ): Promise { + const { prompt, size, duration, modelOptions, logger } = options + + logger.request( + `activity=video.create provider=${this.name} model=${this.model} size=${size ?? 'default'} duration=${duration ?? 'default'}`, + { provider: this.name, model: this.model }, + ) + + try { + const resolved = resolveMediaPrompt(prompt) + + if (resolved.videos.length > 0) { + throw new Error( + `${this.name}.createVideoJob does not support video prompt parts (model: ${this.model}).`, + ) + } + if (resolved.audios.length > 0) { + throw new Error( + `${this.name}.createVideoJob does not support audio prompt parts (model: ${this.model}).`, + ) + } + + const { image, lastFrame, referenceImages } = await this.routeImageParts( + resolved.images, + ) + + const config: GenerateVideosConfig = { + ...modelOptions, + ...(size !== undefined && { aspectRatio: size }), + ...(duration !== undefined && { durationSeconds: duration }), + ...(lastFrame && { lastFrame }), + ...(referenceImages.length > 0 && { referenceImages }), + } + + const operation = await this.client.models.generateVideos({ + model: this.model, + prompt: resolved.text, + ...(image && { image }), + config, + }) + + if (!operation.name) { + throw new Error( + 'Veo did not return an operation name for the video generation job.', + ) + } + + return { jobId: operation.name, model: 
this.model } + } catch (error) { + logger.errors(`${this.name}.createVideoJob fatal`, { + error, + source: `${this.name}.createVideoJob`, + }) + throw error + } + } + + /** + * Route image prompt parts onto Veo's request fields by `metadata.role`. + */ + private async routeImageParts( + parts: Array>, + ): Promise<{ + image: Image | undefined + lastFrame: Image | undefined + referenceImages: Array + }> { + let image: Image | undefined + let lastFrame: Image | undefined + const referenceImages: Array = [] + + for (const part of parts) { + const role = part.metadata?.role + switch (role) { + case 'end_frame': { + if (lastFrame) { + throw new Error( + `${this.name}: Veo accepts at most one 'end_frame' image.`, + ) + } + lastFrame = await imagePartToVeoImage(part) + break + } + case 'reference': + case 'character': { + referenceImages.push({ + image: await imagePartToVeoImage(part), + referenceType: VideoGenerationReferenceType.ASSET, + }) + break + } + case 'start_frame': + case undefined: { + if (image) { + throw new Error( + `${this.name}: Veo accepts at most one starting image; received multiple 'start_frame'/un-roled images. Use metadata.role ('end_frame', 'reference') to disambiguate the others.`, + ) + } + image = await imagePartToVeoImage(part) + break + } + case 'mask': + case 'control': + throw new Error( + `${this.name}: unsupported image role "${role}" for Veo video generation.`, + ) + } + } + + return { image, lastFrame, referenceImages } + } + + async getVideoStatus(jobId: string): Promise { + const operation = await this.getOperation(jobId) + + if (!operation.done) { + return { jobId, status: 'processing' } + } + + if (operation.error) { + return { + jobId, + status: 'failed', + error: operationErrorMessage(operation.error), + } + } + + // The operation can finish "successfully" with every sample dropped by + // Responsible-AI filters — surface that as a failure instead of letting + // getVideoUrl() throw on an empty response. 
+ const videos = operation.response?.generatedVideos ?? [] + if (videos.length === 0) { + const reasons = operation.response?.raiMediaFilteredReasons + return { + jobId, + status: 'failed', + error: reasons?.length + ? `Video was filtered by Responsible-AI: ${reasons.join('; ')}` + : 'Veo returned no generated videos.', + } + } + + return { jobId, status: 'completed' } + } + + async getVideoUrl(jobId: string): Promise { + const operation = await this.getOperation(jobId) + + if (!operation.done) { + throw new Error( + `Video is not ready yet. Check status first. Job ID: ${jobId}`, + ) + } + + if (operation.error) { + throw new Error( + `Video generation failed: ${operationErrorMessage(operation.error)}`, + ) + } + + const uri = operation.response?.generatedVideos?.[0]?.video?.uri + if (!uri) { + const reasons = operation.response?.raiMediaFilteredReasons + throw new Error( + reasons?.length + ? `Video was filtered by Responsible-AI: ${reasons.join('; ')}` + : `Video URL not found in operation response. Job ID: ${jobId}`, + ) + } + + return { jobId, url: uri } + } + + override availableDurations(): DurationOptions< + GeminiVideoModelDurationByName[TModel] + > { + return getGeminiVideoDurationOptions(this.model) + } + + override snapDuration( + seconds: number, + ): GeminiVideoModelDurationByName[TModel] | undefined { + return snapToDurationOption(seconds, this.availableDurations()) + } + + /** + * Fetch the long-running operation by name. The SDK's + * `operations.getVideosOperation` needs a real `GenerateVideosOperation` + * instance (it calls `_fromAPIResponse` on it), so reconstruct one from + * the job ID rather than passing an object literal. + */ + private async getOperation(jobId: string): Promise { + const operation = new GenerateVideosOperation() + operation.name = jobId + return await this.client.operations.getVideosOperation({ operation }) + } +} + +/** + * Creates a Gemini video adapter with an explicit API key. 
+ * Type resolution happens here at the call site. + * + * @experimental Video generation is an experimental feature and may change. + * + * @param model - The model name (e.g., 'veo-3.1-generate-preview') + * @param apiKey - Your Google API key + * @param config - Optional additional configuration + * @returns Configured Gemini video adapter instance with resolved types + * + * @example + * ```typescript + * const adapter = createGeminiVideo('veo-3.1-generate-preview', 'your-api-key'); + * + * const { jobId } = await generateVideo({ + * adapter, + * prompt: 'A beautiful sunset over the ocean', + * duration: adapter.snapDuration(7), // → 6 + * }); + * ``` + */ +export function createGeminiVideo( + model: TModel, + apiKey: string, + config?: Omit, +): GeminiVideoAdapter { + return new GeminiVideoAdapter({ apiKey, ...config }, model) +} + +/** + * Creates a Gemini video adapter with automatic API key detection from environment variables. + * Type resolution happens here at the call site. + * + * Looks for `GOOGLE_API_KEY` or `GEMINI_API_KEY` in: + * - `process.env` (Node.js) + * - `window.env` (Browser with injected env) + * + * @experimental Video generation is an experimental feature and may change. 
+ * + * @param model - The model name (e.g., 'veo-3.1-generate-preview') + * @param config - Optional configuration (excluding apiKey which is auto-detected) + * @returns Configured Gemini video adapter instance with resolved types + * @throws Error if GOOGLE_API_KEY or GEMINI_API_KEY is not found in environment + * + * @example + * ```typescript + * // Automatically uses GOOGLE_API_KEY from environment + * const adapter = geminiVideo('veo-3.1-generate-preview'); + * + * // Create a video generation job + * const { jobId } = await generateVideo({ + * adapter, + * prompt: 'A cat playing piano' + * }); + * + * // Poll for status + * const status = await getVideoJobStatus({ adapter, jobId }); + * ``` + */ +export function geminiVideo( + model: TModel, + config?: Omit, +): GeminiVideoAdapter { + const apiKey = getGeminiApiKeyFromEnv() + return createGeminiVideo(model, apiKey, config) +} diff --git a/packages/ai-gemini/src/index.ts b/packages/ai-gemini/src/index.ts index f204c184d..462de4067 100644 --- a/packages/ai-gemini/src/index.ts +++ b/packages/ai-gemini/src/index.ts @@ -61,6 +61,30 @@ export { type GeminiAudioProviderOptions, } from './adapters/audio' +// Video / Veo generation adapter (experimental) +/** + * @experimental Veo video generation is an experimental feature and may change. 
+ */ +export { + GeminiVideoAdapter, + createGeminiVideo, + geminiVideo, + type GeminiVideoConfig, +} from './adapters/video' +export { + GEMINI_VIDEO_DURATIONS, + getGeminiVideoDurationOptions, +} from './video/video-provider-options' +export type { + GeminiVideoModel, + GeminiVideoModelDurationByName, + GeminiVideoModelInputModalitiesByName, + GeminiVideoModelProviderOptionsByName, + GeminiVideoModelSizeByName, + GeminiVideoProviderOptions, + GeminiVideoSize, +} from './video/video-provider-options' + // Re-export models from model-meta for convenience export { GEMINI_MODELS, @@ -71,6 +95,7 @@ export { GEMINI_IMAGE_MODELS as GeminiImageModels } from './model-meta' export { GEMINI_TTS_MODELS as GeminiTTSModels } from './model-meta' export { GEMINI_TTS_VOICES as GeminiTTSVoices } from './model-meta' export { GEMINI_AUDIO_MODELS as GeminiAudioModels } from './model-meta' +export { GEMINI_VIDEO_MODELS as GeminiVideoModels } from './model-meta' export type { GeminiModels as GeminiTextModel } from './model-meta' export type { GeminiImageModels as GeminiImageModel } from './model-meta' export type { GeminiTTSVoice } from './model-meta' diff --git a/packages/ai-gemini/src/model-meta.ts b/packages/ai-gemini/src/model-meta.ts index 610bd2ef6..150b23fb2 100644 --- a/packages/ai-gemini/src/model-meta.ts +++ b/packages/ai-gemini/src/model-meta.ts @@ -631,7 +631,11 @@ const IMAGEN_3 = { GeminiCommonConfigOptions & GeminiCachedContentOptions > -/** +/** + * Veo video generation models. Pricing is per second of generated video + * (audio+video rate where the model supports audio). + * @experimental Veo video generation is an experimental feature and may change. 
+ */ const VEO_3_1_PREVIEW = { name: 'veo-3.1-generate-preview', max_input_tokens: 1024, @@ -650,9 +654,9 @@ const VEO_3_1_PREVIEW = { }, } as const satisfies ModelMeta< GeminiToolConfigOptions & - GeminiSafetyOptions & - GeminiGenerationConfigOptions & - GeminiCachedContentOptions + GeminiSafetyOptions & + GeminiCommonConfigOptions & + GeminiCachedContentOptions > const VEO_3_1_FAST_PREVIEW = { @@ -673,9 +677,9 @@ const VEO_3_1_FAST_PREVIEW = { }, } as const satisfies ModelMeta< GeminiToolConfigOptions & - GeminiSafetyOptions & - GeminiGenerationConfigOptions & - GeminiCachedContentOptions + GeminiSafetyOptions & + GeminiCommonConfigOptions & + GeminiCachedContentOptions > const VEO_3 = { @@ -696,9 +700,9 @@ const VEO_3 = { }, } as const satisfies ModelMeta< GeminiToolConfigOptions & - GeminiSafetyOptions & - GeminiGenerationConfigOptions & - GeminiCachedContentOptions + GeminiSafetyOptions & + GeminiCommonConfigOptions & + GeminiCachedContentOptions > const VEO_3_FAST = { @@ -719,9 +723,9 @@ const VEO_3_FAST = { }, } as const satisfies ModelMeta< GeminiToolConfigOptions & - GeminiSafetyOptions & - GeminiGenerationConfigOptions & - GeminiCachedContentOptions + GeminiSafetyOptions & + GeminiCommonConfigOptions & + GeminiCachedContentOptions > const VEO_2 = { @@ -741,10 +745,10 @@ const VEO_2 = { }, } as const satisfies ModelMeta< GeminiToolConfigOptions & - GeminiSafetyOptions & - GeminiGenerationConfigOptions & - GeminiCachedContentOptions -> */ + GeminiSafetyOptions & + GeminiCommonConfigOptions & + GeminiCachedContentOptions +> const GEMINI_3_5_FLASH = { name: 'gemini-3.5-flash', @@ -878,20 +882,17 @@ export const GEMINI_TTS_VOICES = [ export type GeminiTTSVoice = (typeof GEMINI_TTS_VOICES)[number] -/* const GEMINI_AUDIO_MODELS = [ - GEMINI_2_5_PRO_TTS.name, - GEMINI_2_5_FLASH_TTS.name, - GEMINI_2_5_FLASH_LIVE.name, - GEMINI_2_FLASH_LIVE.name, -] as const - - const GEMINI_VIDEO_MODELS = [ +/** + * Veo video generation models. 
+ * @experimental Veo video generation is an experimental feature and may change. + */ +export const GEMINI_VIDEO_MODELS = [ VEO_3_1_PREVIEW.name, VEO_3_1_FAST_PREVIEW.name, VEO_3.name, VEO_3_FAST.name, VEO_2.name, -] as const */ +] as const // Manual type map for per-model provider options export type GeminiChatModelProviderOptionsByName = { diff --git a/packages/ai-gemini/src/video/video-provider-options.ts b/packages/ai-gemini/src/video/video-provider-options.ts new file mode 100644 index 000000000..b1fd5671a --- /dev/null +++ b/packages/ai-gemini/src/video/video-provider-options.ts @@ -0,0 +1,126 @@ +/** + * Gemini Veo Video Generation Provider Options + * + * Based on https://ai.google.dev/gemini-api/docs/video + * + * @experimental Video generation is an experimental feature and may change. + */ +import type { DurationOptions } from '@tanstack/ai/adapters' +import type { GenerateVideosConfig } from '@google/genai' +import type { GEMINI_VIDEO_MODELS } from '../model-meta' + +/** + * Model type for Gemini Veo video generation. + * @experimental Video generation is an experimental feature and may change. + */ +export type GeminiVideoModel = (typeof GEMINI_VIDEO_MODELS)[number] + +/** + * Supported aspect ratios for Veo video generation. This is the `size` value + * for the Gemini video adapter — Veo expresses output shape as an aspect + * ratio (plus an optional `resolution` in `modelOptions`), not pixel + * dimensions. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type GeminiVideoSize = '16:9' | '9:16' + +/** + * Provider-specific options for Gemini Veo video generation. 
+ * + * Derived from the SDK's `GenerateVideosConfig`, minus the fields the + * adapter manages itself: + * - `durationSeconds` — set via the typed top-level `duration` option + * (use `adapter.snapDuration(seconds)` to coerce raw seconds) + * - `aspectRatio` — set via the top-level `size` option + * - `lastFrame` / `referenceImages` — set via image parts in the `prompt` + * with `metadata.role: 'end_frame'` / `'reference'` / `'character'` + * - `httpOptions` / `abortSignal` — client-level transport concerns + * + * @experimental Video generation is an experimental feature and may change. + */ +export type GeminiVideoProviderOptions = Omit< + GenerateVideosConfig, + | 'durationSeconds' + | 'aspectRatio' + | 'lastFrame' + | 'referenceImages' + | 'httpOptions' + | 'abortSignal' +> + +/** + * Model-specific provider options mapping. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type GeminiVideoModelProviderOptionsByName = { + [TModel in GeminiVideoModel]: GeminiVideoProviderOptions +} + +/** + * Model-specific size (aspect ratio) mapping. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type GeminiVideoModelSizeByName = { + [TModel in GeminiVideoModel]: GeminiVideoSize +} + +/** + * Per-model prompt input modalities. Every Veo model accepts image + * conditioning inputs alongside the text prompt (first frame, last frame, + * reference images — the latter is a Veo 3.1 feature). + * + * @experimental Video generation is an experimental feature and may change. + */ +export type GeminiVideoModelInputModalitiesByName = { + [TModel in GeminiVideoModel]: readonly ['image'] +} + +/** + * Per-model duration unions (seconds, as numbers — the API's + * `parameters.durationSeconds` field is numeric). + * + * @experimental Video generation is an experimental feature and may change. 
+ */ +export type GeminiVideoModelDurationByName = { + 'veo-3.1-generate-preview': 4 | 6 | 8 + 'veo-3.1-fast-generate-preview': 4 | 6 | 8 + 'veo-3.0-generate-001': 4 | 6 | 8 + 'veo-3.0-fast-generate-001': 4 | 6 | 8 + 'veo-2.0-generate-001': 5 | 6 | 8 +} + +/** + * Runtime duration table backing `availableDurations()` / `snapDuration()`. + * + * Curated from the official Veo docs + * (https://ai.google.dev/gemini-api/docs/video) — the Gemini OpenAPI spec + * types the `:predictLongRunning` request's `parameters` as unconstrained, + * so it carries no per-model duration information to derive these from. + * + * @experimental Video generation is an experimental feature and may change. + */ +export const GEMINI_VIDEO_DURATIONS: { + readonly [TModel in GeminiVideoModel]: DurationOptions< + GeminiVideoModelDurationByName[TModel] + > +} = { + 'veo-3.1-generate-preview': { kind: 'discrete', values: [4, 6, 8] }, + 'veo-3.1-fast-generate-preview': { kind: 'discrete', values: [4, 6, 8] }, + 'veo-3.0-generate-001': { kind: 'discrete', values: [4, 6, 8] }, + 'veo-3.0-fast-generate-001': { kind: 'discrete', values: [4, 6, 8] }, + 'veo-2.0-generate-001': { kind: 'discrete', values: [5, 6, 8] }, +} + +/** + * Look up the duration options for a Veo model. + * + * @experimental Video generation is an experimental feature and may change. 
+ */ +export function getGeminiVideoDurationOptions( + model: TModel, +): DurationOptions { + return GEMINI_VIDEO_DURATIONS[model] +} diff --git a/packages/ai-gemini/tests/video-adapter.test.ts b/packages/ai-gemini/tests/video-adapter.test.ts new file mode 100644 index 000000000..1e5945e01 --- /dev/null +++ b/packages/ai-gemini/tests/video-adapter.test.ts @@ -0,0 +1,518 @@ +import { describe, expect, expectTypeOf, it, vi } from 'vitest' +import { resolveDebugOption } from '@tanstack/ai/adapter-internals' +import { + GeminiVideoAdapter, + createGeminiVideo, + geminiVideo, +} from '../src/adapters/video' +import { + GEMINI_VIDEO_DURATIONS, + getGeminiVideoDurationOptions, +} from '../src/video/video-provider-options' +import type { GenerateVideosOperation, GoogleGenAI } from '@google/genai' +import type { GeminiVideoModel } from '../src/video/video-provider-options' + +const testLogger = resolveDebugOption(false) + +interface ClientStub { + models: { generateVideos: ReturnType } + operations: { getVideosOperation: ReturnType } +} + +function createClientStub( + overrides: { + createResult?: Partial + pollResult?: Partial + } = {}, +): ClientStub { + return { + models: { + generateVideos: vi.fn().mockResolvedValue( + overrides.createResult ?? { + name: 'models/veo-3.1-generate-preview/operations/op-123', + }, + ), + }, + operations: { + getVideosOperation: vi.fn().mockResolvedValue( + overrides.pollResult ?? { + name: 'models/veo-3.1-generate-preview/operations/op-123', + done: true, + response: { + generatedVideos: [ + { video: { uri: 'https://example.com/video.mp4' } }, + ], + }, + }, + ), + }, + } +} + +/** + * Test subclass that injects a stubbed GoogleGenAI client through the + * protected `client` seam instead of patching globals. 
+ */ +class StubbedGeminiVideoAdapter< + TModel extends GeminiVideoModel, +> extends GeminiVideoAdapter { + constructor(model: TModel, stub: ClientStub) { + super({ apiKey: 'test-key' }, model) + this.client = stub as unknown as GoogleGenAI + } +} + +describe('Gemini Video Adapter', () => { + describe('factories', () => { + it('creates an adapter with the provided API key', () => { + const adapter = createGeminiVideo('veo-3.1-generate-preview', 'test-key') + expect(adapter).toBeInstanceOf(GeminiVideoAdapter) + expect(adapter.kind).toBe('video') + expect(adapter.name).toBe('gemini') + expect(adapter.model).toBe('veo-3.1-generate-preview') + }) + + it('geminiVideo throws without an API key in the environment', () => { + const googleKey = process.env.GOOGLE_API_KEY + const geminiKey = process.env.GEMINI_API_KEY + delete process.env.GOOGLE_API_KEY + delete process.env.GEMINI_API_KEY + try { + expect(() => geminiVideo('veo-3.1-generate-preview')).toThrow( + /GOOGLE_API_KEY or GEMINI_API_KEY/, + ) + } finally { + if (googleKey !== undefined) process.env.GOOGLE_API_KEY = googleKey + if (geminiKey !== undefined) process.env.GEMINI_API_KEY = geminiKey + } + }) + }) + + describe('availableDurations', () => { + it('returns the discrete Veo 3.x duration set', () => { + const adapter = createGeminiVideo('veo-3.0-generate-001', 'test-key') + expect(adapter.availableDurations()).toEqual({ + kind: 'discrete', + values: [4, 6, 8], + }) + }) + + it('returns the discrete Veo 2 duration set', () => { + const adapter = createGeminiVideo('veo-2.0-generate-001', 'test-key') + expect(adapter.availableDurations()).toEqual({ + kind: 'discrete', + values: [5, 6, 8], + }) + }) + + it('covers every model in the duration table', () => { + for (const model of Object.keys( + GEMINI_VIDEO_DURATIONS, + ) as Array) { + expect(getGeminiVideoDurationOptions(model).kind).toBe('discrete') + } + }) + }) + + describe('snapDuration', () => { + it('snaps to the closest valid duration', () => { + const 
adapter = createGeminiVideo('veo-3.0-generate-001', 'test-key') + expect(adapter.snapDuration(3)).toBe(4) + expect(adapter.snapDuration(5)).toBe(4) + expect(adapter.snapDuration(7)).toBe(6) + expect(adapter.snapDuration(100)).toBe(8) + }) + + it('snaps Veo 2 values to its own set', () => { + const adapter = createGeminiVideo('veo-2.0-generate-001', 'test-key') + expect(adapter.snapDuration(1)).toBe(5) + expect(adapter.snapDuration(7)).toBe(6) + expect(adapter.snapDuration(9)).toBe(8) + }) + }) + + describe('per-model duration typing', () => { + it('types duration as the model-specific union at compile time', () => { + const veo3 = createGeminiVideo('veo-3.0-generate-001', 'test-key') + expectTypeOf(veo3.snapDuration).returns.toEqualTypeOf< + 4 | 6 | 8 | undefined + >() + type Veo3Options = Parameters[0] + expectTypeOf().toEqualTypeOf< + 4 | 6 | 8 | undefined + >() + + const veo2 = createGeminiVideo('veo-2.0-generate-001', 'test-key') + expectTypeOf(veo2.snapDuration).returns.toEqualTypeOf< + 5 | 6 | 8 | undefined + >() + type Veo2Options = Parameters[0] + expectTypeOf().toEqualTypeOf< + 5 | 6 | 8 | undefined + >() + }) + }) + + describe('createVideoJob', () => { + it('starts a long-running operation and returns its name as jobId', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + const result = await adapter.createVideoJob({ + model: 'veo-3.1-generate-preview', + prompt: 'a guitar being played in a store', + size: '16:9', + duration: 6, + modelOptions: { negativePrompt: 'blurry footage' }, + logger: testLogger, + }) + + expect(result).toEqual({ + jobId: 'models/veo-3.1-generate-preview/operations/op-123', + model: 'veo-3.1-generate-preview', + }) + expect(stub.models.generateVideos).toHaveBeenCalledWith({ + model: 'veo-3.1-generate-preview', + prompt: 'a guitar being played in a store', + config: { + negativePrompt: 'blurry footage', + aspectRatio: '16:9', + 
durationSeconds: 6, + }, + }) + }) + + it('omits aspectRatio and durationSeconds when size/duration are not given', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-2.0-generate-001', + stub, + ) + + await adapter.createVideoJob({ + model: 'veo-2.0-generate-001', + prompt: 'a sunset', + logger: testLogger, + }) + + expect(stub.models.generateVideos).toHaveBeenCalledWith({ + model: 'veo-2.0-generate-001', + prompt: 'a sunset', + config: {}, + }) + }) + + it('throws when the operation comes back without a name', async () => { + const stub = createClientStub({ createResult: {} }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.0-generate-001', + stub, + ) + + await expect( + adapter.createVideoJob({ + model: 'veo-3.0-generate-001', + prompt: 'a sunset', + logger: testLogger, + }), + ).rejects.toThrow(/operation name/) + }) + }) + + describe('multimodal prompt routing', () => { + const dataImage = (role?: 'start_frame' | 'end_frame' | 'reference') => + ({ + type: 'image', + source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/jpeg' }, + ...(role && { metadata: { role } }), + }) as const + + it('routes an un-roled image part to the input image', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await adapter.createVideoJob({ + model: 'veo-3.1-generate-preview', + prompt: [ + { type: 'text', content: 'animate this product photo' }, + dataImage(), + ], + logger: testLogger, + }) + + expect(stub.models.generateVideos).toHaveBeenCalledWith({ + model: 'veo-3.1-generate-preview', + prompt: 'animate this product photo', + image: { imageBytes: 'aGVsbG8=', mimeType: 'image/jpeg' }, + config: {}, + }) + }) + + it('routes end_frame and reference roles to lastFrame/referenceImages', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) 
+ + await adapter.createVideoJob({ + model: 'veo-3.1-generate-preview', + prompt: [ + { type: 'text', content: 'pan from start to end' }, + dataImage('start_frame'), + dataImage('end_frame'), + dataImage('reference'), + ], + logger: testLogger, + }) + + const call = stub.models.generateVideos.mock.calls[0]?.[0] + expect(call.image).toEqual({ + imageBytes: 'aGVsbG8=', + mimeType: 'image/jpeg', + }) + expect(call.config.lastFrame).toEqual({ + imageBytes: 'aGVsbG8=', + mimeType: 'image/jpeg', + }) + expect(call.config.referenceImages).toEqual([ + { + image: { imageBytes: 'aGVsbG8=', mimeType: 'image/jpeg' }, + referenceType: 'ASSET', + }, + ]) + }) + + it('decodes base64 data: URI image sources', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.0-generate-001', + stub, + ) + + await adapter.createVideoJob({ + model: 'veo-3.0-generate-001', + prompt: [ + { type: 'text', content: 'animate' }, + { + type: 'image', + source: { type: 'url', value: 'data:image/png;base64,aGVsbG8=' }, + }, + ], + logger: testLogger, + }) + + const call = stub.models.generateVideos.mock.calls[0]?.[0] + expect(call.image).toEqual({ + imageBytes: 'aGVsbG8=', + mimeType: 'image/png', + }) + }) + + it('rejects multiple starting images', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await expect( + adapter.createVideoJob({ + model: 'veo-3.1-generate-preview', + prompt: [ + { type: 'text', content: 'animate' }, + dataImage(), + dataImage(), + ], + logger: testLogger, + }), + ).rejects.toThrow(/at most one starting image/) + }) + + it('rejects video prompt parts', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await expect( + adapter.createVideoJob({ + model: 'veo-3.1-generate-preview', + prompt: [ + { type: 'text', content: 'extend this' }, + { + type: 
'video', + source: { + type: 'data', + value: 'aGVsbG8=', + mimeType: 'video/mp4', + }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/video prompt parts/) + }) + }) + + describe('getVideoStatus', () => { + const jobId = 'models/veo-3.1-generate-preview/operations/op-123' + + it('polls the operation by job ID', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await adapter.getVideoStatus(jobId) + + const call = stub.operations.getVideosOperation.mock.calls[0]?.[0] as { + operation: GenerateVideosOperation + } + expect(call.operation.name).toBe(jobId) + }) + + it('maps an in-flight operation to processing', async () => { + const stub = createClientStub({ + pollResult: { name: jobId, done: false }, + }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + expect(await adapter.getVideoStatus(jobId)).toEqual({ + jobId, + status: 'processing', + }) + }) + + it('maps a completed operation with videos to completed', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + expect(await adapter.getVideoStatus(jobId)).toEqual({ + jobId, + status: 'completed', + }) + }) + + it('maps an operation error to failed with its message', async () => { + const stub = createClientStub({ + pollResult: { + name: jobId, + done: true, + error: { code: 3, message: 'Invalid duration' }, + }, + }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + expect(await adapter.getVideoStatus(jobId)).toEqual({ + jobId, + status: 'failed', + error: 'Invalid duration', + }) + }) + + it('maps a fully RAI-filtered response to failed with the reasons', async () => { + const stub = createClientStub({ + pollResult: { + name: jobId, + done: true, + response: { + generatedVideos: [], + raiMediaFilteredCount: 1, + raiMediaFilteredReasons: 
['unsafe content'], + }, + }, + }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + const status = await adapter.getVideoStatus(jobId) + expect(status.status).toBe('failed') + expect(status.error).toContain('unsafe content') + }) + }) + + describe('getVideoUrl', () => { + const jobId = 'models/veo-3.1-generate-preview/operations/op-123' + + it('returns the generated video URI', async () => { + const stub = createClientStub() + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + expect(await adapter.getVideoUrl(jobId)).toEqual({ + jobId, + url: 'https://example.com/video.mp4', + }) + }) + + it('throws when the operation is still running', async () => { + const stub = createClientStub({ + pollResult: { name: jobId, done: false }, + }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await expect(adapter.getVideoUrl(jobId)).rejects.toThrow(/not ready/) + }) + + it('throws with the operation error message on failure', async () => { + const stub = createClientStub({ + pollResult: { + name: jobId, + done: true, + error: { code: 13, message: 'internal error' }, + }, + }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await expect(adapter.getVideoUrl(jobId)).rejects.toThrow(/internal error/) + }) + + it('throws with RAI reasons when every sample was filtered', async () => { + const stub = createClientStub({ + pollResult: { + name: jobId, + done: true, + response: { + generatedVideos: [], + raiMediaFilteredCount: 1, + raiMediaFilteredReasons: ['unsafe content'], + }, + }, + }) + const adapter = new StubbedGeminiVideoAdapter( + 'veo-3.1-generate-preview', + stub, + ) + + await expect(adapter.getVideoUrl(jobId)).rejects.toThrow(/unsafe content/) + }) + }) +}) diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 
af9d80cf0..cae40b000 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -3,8 +3,9 @@ name: ai-core/media-generation description: > Image, audio, video, speech (TTS), and transcription generation using activity-specific adapters: generateImage() with openaiImage/geminiImage, - generateAudio() with geminiAudio/falAudio, generateVideo() with async - polling, generateSpeech() with openaiSpeech, generateTranscription() with + generateAudio() with geminiAudio/falAudio, generateVideo() with + openaiVideo/geminiVideo (async polling, per-model typed durations), + generateSpeech() with openaiSpeech, generateTranscription() with openaiTranscription. React hooks: useGenerateImage, useGenerateAudio, useGenerateSpeech, useTranscription, useGenerateVideo. TanStack Start server function integration with toServerSentEventsResponse. @@ -428,6 +429,31 @@ const stream = generateVideo({ return toServerSentEventsResponse(stream) ``` +Google Veo (`@tanstack/ai-gemini`) uses the same jobs/polling flow. Its +`duration` option is typed per model (e.g. `4 | 6 | 8` for Veo 3.x, +`5 | 6 | 8` for Veo 2); use `adapter.snapDuration(seconds)` to coerce raw +seconds and `adapter.availableDurations()` to enumerate the valid set. 
+Image prompt parts route by `metadata.role`: first un-roled / +`'start_frame'` image → input image, `'end_frame'` → `lastFrame`, +`'reference'` / `'character'` → `referenceImages`: + +```typescript +import { geminiVideo } from '@tanstack/ai-gemini' + +const adapter = geminiVideo('veo-3.1-generate-preview') +adapter.availableDurations() // { kind: 'discrete', values: [4, 6, 8] } + +const { jobId } = await generateVideo({ + adapter, + prompt: 'A golden retriever playing in sunflowers', + size: '16:9', // Veo sizes are aspect ratios: '16:9' | '9:16' + duration: adapter.snapDuration(7), // 6 + modelOptions: { resolution: '720p', negativePrompt: 'blurry footage' }, +}) +// Note: Veo result URLs require the Google API key to download +// (x-goog-api-key header or ?key= query parameter). +``` + Client hook with job tracking: ```tsx diff --git a/packages/ai/src/activities/generateVideo/adapter.ts b/packages/ai/src/activities/generateVideo/adapter.ts index 4f0eaed21..64dd0162e 100644 --- a/packages/ai/src/activities/generateVideo/adapter.ts +++ b/packages/ai/src/activities/generateVideo/adapter.ts @@ -6,6 +6,25 @@ import type { VideoUrlResult, } from '../../types' +/** + * Structured description of the durations a video model accepts. + * + * Tagged union so the same shape can express discrete enums (OpenAI Sora, + * Veo), continuous ranges, mixed shapes, and models with no duration field. + * Consumed by `VideoAdapter.availableDurations()`. + * + * @experimental Video generation is an experimental feature and may change. 
+ */ +export type DurationOptions = + | { kind: 'discrete'; values: ReadonlyArray> } + | { kind: 'range'; min: number; max: number; step?: number; unit: 'seconds' } + | { + kind: 'mixed' + values: ReadonlyArray> + range?: { min: number; max: number; step?: number } + } + | { kind: 'none' } + /** * Configuration for video adapter instances * @@ -34,6 +53,9 @@ export interface VideoAdapterConfig { * - TModelSizeByName: Map from model name to its supported sizes * - TModelInputModalitiesByName: Map from model name to the non-text prompt * modalities it accepts (constrains the `prompt` part types at compile time) + * - TModelDurationByName: Map from model name to its supported duration + * union. Defaults to `Record` so adapters that haven't + * declared a map keep today's `duration?: number` typing. */ export interface VideoAdapter< TModel extends string = string, @@ -45,6 +67,8 @@ export interface VideoAdapter< >, TModelInputModalitiesByName extends ModelInputModalitiesByName = ModelInputModalitiesByName, + TModelDurationByName extends Record = + Record, > { /** Discriminator for adapter kind - used to determine API shape */ readonly kind: 'video' @@ -61,6 +85,7 @@ export interface VideoAdapter< modelProviderOptionsByName: TModelProviderOptionsByName modelSizeByName: TModelSizeByName modelInputModalitiesByName: TModelInputModalitiesByName + modelDurationByName: TModelDurationByName } /** @@ -68,7 +93,11 @@ export interface VideoAdapter< * Returns a job ID that can be used to poll for status and retrieve the video. */ createVideoJob: ( - options: VideoGenerationOptions, + options: VideoGenerationOptions< + TProviderOptions, + TModelSizeByName[TModel], + TModelDurationByName[TModel] + >, ) => Promise /** @@ -81,13 +110,26 @@ export interface VideoAdapter< * Should only be called after status is 'completed'. */ getVideoUrl: (jobId: string) => Promise + + /** + * Describe the durations this adapter's model accepts. 
Returns a tagged + * union so consumers can render UI / coerce input without provider-specific + * knowledge. + */ + availableDurations: () => DurationOptions + + /** + * Coerce a raw seconds value to the closest valid duration for this model. + * Returns `undefined` for models with no duration field. + */ + snapDuration: (seconds: number) => TModelDurationByName[TModel] | undefined } /** * A VideoAdapter with any/unknown type parameters. * Useful as a constraint in generic functions and interfaces. */ -export type AnyVideoAdapter = VideoAdapter +export type AnyVideoAdapter = VideoAdapter /** * Abstract base class for video generation adapters. @@ -107,12 +149,15 @@ export abstract class BaseVideoAdapter< >, TModelInputModalitiesByName extends ModelInputModalitiesByName = ModelInputModalitiesByName, + TModelDurationByName extends Record = + Record, > implements VideoAdapter< TModel, TProviderOptions, TModelProviderOptionsByName, TModelSizeByName, - TModelInputModalitiesByName + TModelInputModalitiesByName, + TModelDurationByName > { readonly kind = 'video' as const abstract readonly name: string @@ -124,6 +169,7 @@ export abstract class BaseVideoAdapter< modelProviderOptionsByName: TModelProviderOptionsByName modelSizeByName: TModelSizeByName modelInputModalitiesByName: TModelInputModalitiesByName + modelDurationByName: TModelDurationByName } protected config: VideoAdapterConfig @@ -134,13 +180,33 @@ export abstract class BaseVideoAdapter< } abstract createVideoJob( - options: VideoGenerationOptions, + options: VideoGenerationOptions< + TProviderOptions, + TModelSizeByName[TModel], + TModelDurationByName[TModel] + >, ): Promise abstract getVideoStatus(jobId: string): Promise abstract getVideoUrl(jobId: string): Promise + /** + * Default implementation returns `{ kind: 'none' }`. Adapters that have + * declared their per-model duration map should override this. 
+ */ + availableDurations(): DurationOptions { + return { kind: 'none' } + } + + /** + * Default implementation returns `undefined`. Adapters that have declared + * their per-model duration map should override. + */ + snapDuration(_seconds: number): TModelDurationByName[TModel] | undefined { + return undefined + } + protected generateId(): string { return `${this.name}-${Date.now()}-${Math.random().toString(36).substring(7)}` } diff --git a/packages/ai/src/activities/generateVideo/index.ts b/packages/ai/src/activities/generateVideo/index.ts index cc942a5a2..643db32f0 100644 --- a/packages/ai/src/activities/generateVideo/index.ts +++ b/packages/ai/src/activities/generateVideo/index.ts @@ -67,6 +67,25 @@ export type VideoPromptForAdapter = : MediaPrompt : MediaPrompt +/** + * Extract the duration type for a VideoAdapter's model via ~types. + * Mirrors `VideoSizeForAdapter`. Falls back to `number` for adapters that + * haven't declared per-model duration constraints. + */ +export type VideoDurationForAdapter = + TAdapter extends VideoAdapter< + infer TModel, + any, + any, + any, + any, + infer TDurationMap + > + ? TModel extends keyof TDurationMap + ? TDurationMap[TModel] + : number + : number + // =========================== // Activity Options Types @@ -113,8 +132,13 @@ export type VideoCreateOptions< prompt: VideoPromptForAdapter /** Video size — format depends on the provider (e.g., "16:9", "1280x720") */ size?: VideoSizeForAdapter - /** Video duration in seconds */ - duration?: number + /** + * Video duration in seconds. Adapters that declare a per-model duration + * map narrow this to the model's valid union (e.g. `4 | 6 | 8` for Veo 3). + * Pass `adapter.snapDuration(seconds)` to coerce raw seconds to a valid + * value. + */ + duration?: VideoDurationForAdapter /** * Whether to stream the video generation lifecycle. 
* When true, returns an AsyncIterable that handles the full diff --git a/packages/ai/src/activities/generateVideo/snap.ts b/packages/ai/src/activities/generateVideo/snap.ts new file mode 100644 index 000000000..f779d32f0 --- /dev/null +++ b/packages/ai/src/activities/generateVideo/snap.ts @@ -0,0 +1,100 @@ +import type { DurationOptions } from './adapter' + +/** + * Extract a numeric seconds value from a `DurationOptions` entry. Returns + * `null` for entries that don't parse as a number — e.g. `'auto'`. + * + * Handles the keyword-with-unit form FAL uses for Luma/Veo (`'8s'`, `'9s'`) + * by stripping a trailing `s`. Pure-numeric strings (`'5'`, `'10'`) parse via + * Number(). Numbers pass through. + */ +function entryToSeconds(entry: string | number): number | null { + if (typeof entry === 'number') { + return Number.isFinite(entry) ? entry : null + } + const stripped = entry.endsWith('s') ? entry.slice(0, -1) : entry + const parsed = Number(stripped) + return Number.isFinite(parsed) ? parsed : null +} + +/** + * Snap a raw seconds value to the closest valid duration for a model's + * `DurationOptions`. + * + * - `none` → `undefined` + * - `discrete` → closest numeric-parseable entry; if none parse, + * returns `values[0]` (keyword-only models like 'auto') + * - `range` → clamped to [min, max] and rounded to `step` (default 1) + * - `mixed` → closest of (discrete numerics ∪ range values) + * + * @experimental Video generation is an experimental feature and may change. + */ +export function snapToDurationOption( + seconds: number, + options: DurationOptions, +): T | undefined { + switch (options.kind) { + case 'none': + return undefined + + case 'discrete': { + return pickClosestDiscrete(seconds, options.values) + } + + case 'range': { + const step = options.step ?? 
1 + const clamped = Math.min(options.max, Math.max(options.min, seconds)) + const snapped = + Math.round((clamped - options.min) / step) * step + options.min + return Math.min(options.max, Math.max(options.min, snapped)) as T + } + + case 'mixed': { + const discreteCandidate = pickClosestDiscrete(seconds, options.values) + if (!options.range) return discreteCandidate + + const { min, max, step = 1 } = options.range + const clamped = Math.min(max, Math.max(min, seconds)) + const rangeValue = Math.min( + max, + Math.max(min, Math.round((clamped - min) / step) * step + min), + ) + + // Compare distance; range value is numeric, discrete may have non-numeric + // first-entry fallback (return distance Infinity for non-numerics). + const discreteSeconds = + typeof discreteCandidate === 'number' + ? discreteCandidate + : discreteCandidate !== undefined + ? (entryToSeconds(discreteCandidate) ?? Infinity) + : Infinity + + return Math.abs(discreteSeconds - seconds) <= + Math.abs(rangeValue - seconds) + ? discreteCandidate + : (rangeValue as T) + } + } +} + +function pickClosestDiscrete( + seconds: number, + values: ReadonlyArray, +): T | undefined { + if (values.length === 0) return undefined + + let best: T | undefined + let bestDistance = Infinity + for (const value of values) { + const v = entryToSeconds(value) + if (v === null) continue + const distance = Math.abs(v - seconds) + if (distance < bestDistance) { + bestDistance = distance + best = value + } + } + + // Keyword-only set (no numeric-parseable entries) — fall back to first entry. + return best ?? 
values[0] +} diff --git a/packages/ai/src/activities/index.ts b/packages/ai/src/activities/index.ts index 69d06be22..07fdbe73a 100644 --- a/packages/ai/src/activities/index.ts +++ b/packages/ai/src/activities/index.ts @@ -119,6 +119,7 @@ export { type VideoCreateOptions, type VideoStatusOptions, type VideoUrlOptions, + type VideoDurationForAdapter, } from './generateVideo/index' export { @@ -126,8 +127,11 @@ export { type VideoAdapter, type VideoAdapterConfig, type AnyVideoAdapter, + type DurationOptions, } from './generateVideo/adapter' +export { snapToDurationOption } from './generateVideo/snap' + // =========================== // TTS Activity // =========================== diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 7ab506016..2bf82a803 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1700,6 +1700,7 @@ export interface AudioGenerationResult { export interface VideoGenerationOptions< TProviderOptions extends object = object, TSize extends string | undefined = string, + TDuration extends string | number | undefined = number, > { /** The model to use for video generation */ model: string @@ -1714,8 +1715,12 @@ export interface VideoGenerationOptions< prompt: MediaPrompt /** Video size — format depends on the provider (e.g., "16:9", "1280x720") */ size?: TSize - /** Video duration in seconds */ - duration?: number + /** + * Video duration in seconds. Adapters that declare a per-model duration + * map narrow this to the model's valid union; use + * `adapter.snapDuration(seconds)` to coerce raw seconds to a valid value. 
+ */ + duration?: TDuration /** Model-specific options for video generation */ modelOptions?: TProviderOptions /** diff --git a/packages/ai/tests/stream-generation.test.ts b/packages/ai/tests/stream-generation.test.ts index 155b10d65..2cb2741c3 100644 --- a/packages/ai/tests/stream-generation.test.ts +++ b/packages/ai/tests/stream-generation.test.ts @@ -170,6 +170,9 @@ describe('generateVideo({ stream: true })', () => { model: 'test-model', '~types': {} as any, + availableDurations: () => ({ kind: 'none' as const }), + snapDuration: () => undefined, + createVideoJob: vi.fn(async () => ({ jobId: 'job-123', model: 'test-model', diff --git a/testing/e2e/global-setup.ts b/testing/e2e/global-setup.ts index c1eed2859..7a858cf91 100644 --- a/testing/e2e/global-setup.ts +++ b/testing/e2e/global-setup.ts @@ -43,6 +43,13 @@ export default async function globalSetup() { mock.mount('/v1/text-to-speech', elevenLabsTTSMount()) mock.mount('/v1/speech-to-text', elevenLabsSTTMount()) + // Gemini Veo video generation. aimock 1.29 mocks Gemini's `:predict` + // (Imagen) endpoint but not the long-running `:predictLongRunning` + + // operations-polling pair Veo uses, so mount both here. Non-Veo paths + // under /v1beta/models (chat, images) return false and fall through to + // aimock's native Gemini handlers. + mock.mount('/v1beta/models', geminiVeoMount()) + // Anthropic server_tool_use bug reproduction (issue #604). aimock can't // natively synthesize `server_tool_use` / `web_fetch_tool_result` content // blocks, so this mount hand-crafts the raw SSE Claude would emit when a @@ -267,6 +274,67 @@ function elevenLabsSTTMount(): Mountable { } } +/** + * Mounts Gemini Veo's long-running video generation endpoints: + * + * - `POST /v1beta/models/{model}:predictLongRunning` — starts the job and + * returns the operation name. + * - `GET /v1beta/models/{model}/operations/{id}` — polls the operation. 
The + * mock completes immediately with the raw MLDev wire shape + * (`response.generateVideoResponse.generatedSamples[0].video.uri`), which + * the `@google/genai` SDK maps to `response.generatedVideos[0].video.uri`. + * + * Mirrors the openai `onVideo` fixture: same prompt-agnostic completed job, + * same target video URL. + */ +function geminiVeoMount(): Mountable { + const VIDEO_URL = 'https://example.com/guitar-store.mp4' + return { + async handleRequest( + req: http.IncomingMessage, + res: http.ServerResponse, + // aimock strips the mount prefix ('/v1beta/models') and any query + // string, so pathname looks like '/{model}:predictLongRunning' or + // '/{model}/operations/{id}'. + pathname: string, + ): Promise { + const createMatch = pathname.match(/^\/([^/:]+):predictLongRunning$/) + if (createMatch && req.method === 'POST') { + await drainBody(req) + res.statusCode = 200 + res.setHeader('Content-Type', 'application/json') + res.end( + JSON.stringify({ + name: `models/${createMatch[1]}/operations/veo-job-e2e`, + }), + ) + return true + } + + const pollMatch = pathname.match(/^\/([^/:]+)\/operations\/([^/]+)$/) + if (pollMatch && req.method === 'GET') { + res.statusCode = 200 + res.setHeader('Content-Type', 'application/json') + res.end( + JSON.stringify({ + name: `models/${pollMatch[1]}/operations/${pollMatch[2]}`, + done: true, + response: { + generateVideoResponse: { + generatedSamples: [{ video: { uri: VIDEO_URL } }], + }, + }, + }), + ) + return true + } + + // Not a Veo path — fall through to aimock's native Gemini handlers. 
+ return false + }, + } +} + /** * Mounts a Claude-shaped SSE response that includes a client `tool_use` block * followed by a `web_fetch` `server_tool_use` block, plus its diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index b4e85a715..5e722de63 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -191,12 +191,17 @@ export const matrix: Record> = { 'sound-effects': new Set(['elevenlabs']), tts: new Set(['openai', 'grok', 'elevenlabs']), transcription: new Set(['openai', 'grok', 'elevenlabs']), - 'video-gen': new Set(['openai']), + // Gemini Veo runs through a custom aimock mount (see geminiVeoMount in + // global-setup.ts) — aimock 1.29 doesn't model the long-running + // `:predictLongRunning` + operations-polling pair natively. + 'video-gen': new Set(['openai', 'gemini']), // image-to-video (image parts in the generateVideo prompt). aimock 1.29's // `/v1/videos` handler parses Sora's multipart upload (the SDK switches to // multipart when `input_reference` carries a File) and matches on the // `prompt` form field, so the OpenAI/Sora route runs end-to-end. fal's - // endpoint-specific fields remain unit-test-only. + // endpoint-specific fields and Gemini Veo's image/lastFrame/referenceImages + // routing remain unit-test-only (the spec's journal assertion is tied to + // aimock's /v1/videos pipeline, which custom mounts bypass). 'image-to-video': new Set(['openai']), // Only Gemini currently surfaces a first-class stateful conversation API via // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental). 
diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts index d399eb33f..759e56527 100644 --- a/testing/e2e/src/lib/media-providers.ts +++ b/testing/e2e/src/lib/media-providers.ts @@ -4,7 +4,11 @@ import { createOpenaiTranscription, createOpenaiVideo, } from '@tanstack/ai-openai' -import { createGeminiAudio, createGeminiImage } from '@tanstack/ai-gemini' +import { + createGeminiAudio, + createGeminiImage, + createGeminiVideo, +} from '@tanstack/ai-gemini' import { createGrokImage, createGrokSpeech, @@ -129,6 +133,10 @@ export function createVideoAdapter( baseURL: openaiUrl(aimockPort), defaultHeaders: headers, }), + gemini: () => + createGeminiVideo('veo-3.1-generate-preview', DUMMY_KEY, { + httpOptions: { baseUrl: llmockBase(aimockPort), headers }, + }), } const factory = factories[provider] if (!factory) throw new Error(`No video adapter for provider: ${provider}`)