Skip to content
This repository was archived by the owner on May 29, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@
"qwen",
"RAAA",
"redteam",
"remarkdetails",
"resd",
"resj",
"resl",
Expand Down
4 changes: 2 additions & 2 deletions docs/src/content/docs/reference/runtime/plugin-mdast.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ const { parse, visit, stringify } = await mdast();
- parsing to mdast tree

```typescript
const root = await parse("# Hello World");
const root = parse("# Hello World");
```

- visiting the tree (see [documentation](https://unifiedjs.com/learn/recipe/tree-traversal/pnp))
Expand All @@ -61,7 +61,7 @@ You can use `inspect` to pretty print a MDAST tree.

```typescript
const { parse, inspect } = await mdast();
const root = await parse("# Hello World");
const root = parse("# Hello World");
console.log(inspect(root));
```

Expand Down
4 changes: 2 additions & 2 deletions packages/cli/src/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ export async function parsePDF(file: string, options: { images: boolean; out: st
* @param file - The path to the DOCX file to parse.
* @param options - Options for parsing the DOCX file.
*/
export async function parseDOCX(file: string, options: DocxParseOptions) {
export async function parseDOCX(file: string, options: DocxParseOptions): Promise<void> {
// Uses DOCXTryParse to extract text from the DOCX file
const res = await DOCXTryParse(file, options);
if (res.error) console.error(res.error);
Expand All @@ -119,7 +119,7 @@ export async function parseDOCX(file: string, options: DocxParseOptions) {
export async function parseHTMLToText(
fileOrUrl: string,
options: { format?: "markdown" | "text"; out?: string },
) {
): Promise<void> {
const { format = "markdown", out } = options || {};
const file: WorkspaceFile = { filename: fileOrUrl };
await resolveFileContent(file);
Expand Down
7 changes: 5 additions & 2 deletions packages/core/src/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
// Licensed under the MIT License.

// This module provides functions to convert HTML content into different formats such as JSON, plain text, and Markdown.
// eslint-disable-next-line @typescript-eslint/triple-slash-reference
/// <reference path="./html-escaper.d.ts" />
// eslint-disable-next-line @typescript-eslint/triple-slash-reference
/// <reference path="./turndown-plugin-gfm.d.ts" />

import { CancellationOptions, checkCancelled } from "./cancellation.js";
import { TraceOptions } from "./trace.js"; // Import TraceOptions for optional logging features
import type { CancellationOptions } from "./cancellation.js";
import { checkCancelled } from "./cancellation.js";
import type { TraceOptions } from "./trace.js"; // Import TraceOptions for optional logging features
import type { HTMLToMarkdownOptions, HTMLToTextOptions } from "./types.js"; // Import HTMLToTextOptions for configuring HTML to text conversion

/**
Expand Down
1 change: 0 additions & 1 deletion packages/plugin-mdast/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
"remark-directive": "^4.0.0",
"remark-frontmatter": "^5.0.0",
"remark-gfm": "^4.0.1",
"remark-github": "^12.0.0",
"remark-math": "^6.0.0",
"remark-mdx": "^3.1.0",
"remark-parse": "^11.0.0",
Expand Down
2 changes: 1 addition & 1 deletion packages/plugin-mdast/src/remarkalerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import type { Plugin } from "unified";
import type { Node, Root, Paragraph, Text, Blockquote, Data } from "mdast";
import { visit } from "unist-util-visit";
import { genaiscriptDebug } from "@genaiscript/core";
const dbg = genaiscriptDebug("mdast:gh-alerts");
const dbg = genaiscriptDebug("mdast:gfm:alerts");

/**
* GitHub alert types supported by the plugin
Expand Down
107 changes: 107 additions & 0 deletions packages/plugin-mdast/src/remarkdetails.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import type { Plugin } from "unified";
import type { Node, Root, Paragraph, Text, Blockquote, Data, Parent, RootContent } from "mdast";
import { visit, SKIP } from "unist-util-visit";
import { remark } from "remark";
import { genaiscriptDebug } from "@genaiscript/core";
const dbg = genaiscriptDebug("mdast:html:details");

/**
 * Options accepted by the `remarkDetails` plugin.
 * No option is defined yet, and the plugin in this file does not read any.
 */
export interface RemarkDetailsOptions {}

/**
 * Custom tree node that the `remarkDetails` plugin puts in place of an `html`
 * node whose value is a single `<details>…</details>` element.
 *
 * Its children are the summary node (when a summary was captured) followed by
 * the nodes obtained by parsing the inner content as markdown.
 */
export interface DetailsElement extends Parent {
type: "detailsElement";
// Raw attribute text captured right after `<details`, leading whitespace
// included (e.g. ` open`); undefined when the tag carries no attributes.
attributes?: string;
data?: Data & {
detailsElement?: {
// Text captured between `<summary>` and `</summary>`.
// NOTE(review): the plugin assigns the regex capture as-is, which is
// undefined when the element has no `<summary>` — confirm whether this
// field should be declared optional.
summary: string;
// Raw text captured between the optional summary and `</details>`,
// i.e. the inner content before it is parsed as markdown.
content: string;
};
};
}

/**
 * Custom tree node representing the `<summary>` of a details element.
 * The `remarkDetails` plugin creates it with a single text child holding the
 * captured summary text.
 */
export interface SummaryElement extends Parent {
type: "summaryElement";
data?: Data & {
summaryElement?: {
// Same summary string as the text child created by the plugin.
text: string;
};
};
}

/**
 * Matches an `html` node value that consists of exactly one `<details>` element.
 *
 * Named groups:
 * - `attributes`: optional raw attribute text of the `<details>` tag, including
 *   its leading whitespace.
 * - `summary`: optional text of a leading `<summary>` element; the summary must
 *   not contain any `<`, otherwise it is left as part of `content`.
 * - `content`: everything between the optional summary and the closing tag,
 *   with surrounding whitespace excluded.
 *
 * The pattern has no `g`/`y` flag, so it carries no `lastIndex` state and can be
 * shared safely across calls instead of being rebuilt for every visited node.
 */
const DETAILS_HTML_PATTERN =
  /^\s*<details(?<attributes>\s+[^>]*)?>(?:\s*<summary(?:\s+[^>]*)?>(?<summary>[^<]*)<\/summary>)?\s*(?<content>[\s\S]*?)\s*<\/details>\s*$/i;

/**
 * Remark plugin that rewrites `html` nodes holding a `<details>` element into
 * structured `detailsElement` nodes.
 *
 * For every matching `html` node the transformer:
 * 1. extracts the tag attributes, the optional summary text and the inner content;
 * 2. parses the inner content as markdown (falling back to a single text node
 *    if parsing throws);
 * 3. replaces the `html` node in its parent with a `detailsElement` whose
 *    children are the optional `summaryElement` followed by the parsed content.
 *
 * `html` nodes that do not match are left untouched.
 *
 * @param options - Plugin options; currently unused.
 * @returns A transformer that mutates the tree in place.
 */
const remarkDetails: Plugin<[RemarkDetailsOptions?], Root> = (options = {}) => {
  return (tree) => {
    visit(tree, "html", (node, index, parent) => {
      const match = DETAILS_HTML_PATTERN.exec(node.value);
      if (!match) return undefined;

      dbg(`parsing %s`, node.value);
      // `groups` is typed as optional by TypeScript; guard the destructuring so
      // the code compiles under strictNullChecks.
      const { attributes, summary, content } = match.groups ?? {};
      dbg(`summary: %s`, summary);
      dbg(`content: %s`, content?.slice(0, 100));

      // Parse content as markdown if it exists
      let contentNodes: RootContent[] = [];
      if (content) {
        try {
          const contentTree = remark().parse(content.trim());
          contentNodes = contentTree?.children;
        } catch (error) {
          dbg(`failed to parse content as markdown: %s`, error);
          // Fallback: keep the raw content as a single text node
          contentNodes = [
            {
              type: "text",
              value: content,
            } satisfies Text,
          ];
        }
      }

      // An absent or empty summary produces no summary node.
      const summaryNode: SummaryElement | undefined = summary
        ? {
            type: "summaryElement",
            data: {
              summaryElement: {
                text: summary,
              },
            },
            children: [
              {
                type: "text",
                value: summary,
              } satisfies Text,
            ],
          }
        : undefined;

      const detailsNode: DetailsElement = {
        type: "detailsElement",
        attributes,
        data: {
          detailsElement: {
            summary: summary,
            content: content,
          },
        },
        // Summary (when present) always comes first, followed by the content.
        children: (summaryNode ? [summaryNode, ...contentNodes] : contentNodes) as RootContent[],
      };

      // Replace the HTML node with the details node
      if (parent && typeof index === "number") {
        // The custom node type is not part of mdast's content unions, hence the cast.
        // eslint-disable-next-line no-param-reassign, @typescript-eslint/no-explicit-any
        parent.children[index] = detailsNode as any;
        // Resume at the same index: the replacement is no longer an `html`
        // node, so the visitor is not invoked on it again.
        return [SKIP, index];
      }

      dbg(`failed to replace node`);
      return undefined;
    });
  };
};

export default remarkDetails;
24 changes: 12 additions & 12 deletions packages/plugin-mdast/src/unified.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { checkRuntime, filenameOrFileToContent, genaiscriptDebug } from "@genais
import type { Processor } from "unified";
import remarkGitHubAlerts from "./remarkalerts.js";
import type { GitHubAlertMarker } from "./remarkalerts.js";
import remarkDetails, { DetailsElement, SummaryElement } from "./remarkdetails.js";
const dbg = genaiscriptDebug("mdast");

export interface MdAstOptions {
Expand All @@ -15,11 +16,6 @@ export interface MdAstOptions {
*/
gfm?: boolean;

/**
* GitHub short links. Default is true.
*/
github?: boolean;

/**
* Generic directive support. Default is true.
*/
Expand All @@ -46,7 +42,6 @@ export async function mdast(options?: MdAstOptions) {
const { inspect } = await import("unist-util-inspect");
const { default: directive } = await import("remark-directive");
const { default: gfm } = await import("remark-gfm");
const { default: github } = await import("remark-github");
const { default: frontmatter } = await import("remark-frontmatter");
const { default: math } = await import("remark-math");
const { default: mdx } = await import("remark-mdx");
Expand All @@ -56,7 +51,7 @@ export async function mdast(options?: MdAstOptions) {
const { visitParents } = await import("unist-util-visit-parents");
await import("mdast-util-mdxjs-esm");

const mdastParse = async (file: string | WorkspaceFile): Promise<Root> => {
const mdastParse = (file: string | WorkspaceFile): Root => {
const content = filenameOrFileToContent(file);
if (!content) return { type: "root", children: [] };

Expand All @@ -65,21 +60,28 @@ export async function mdast(options?: MdAstOptions) {
const processor = unified().use(parse);
usePlugins(processor, "parse");
const ast = processor.parse(content);
const processed = await processor.run(ast);
const processed = processor.runSync(ast);
return processed as Root;
};

const mdastStringify = (root: Root): string => {
const mdastStringify = (root: Root, options?: {}): string => {
if (!root) return "";

dbg(`stringify`);
const processor = unified();
usePlugins(processor, "stringify");
processor.use(stringify, {
...(options || {}),
handlers: {
githubAlertMarker(node: GitHubAlertMarker) {
return node.value;
},
detailsElement(node: DetailsElement) {
return `<details ${node.attributes || ""}>${node.children.map((child) => processor.stringify(child)).join("")}</details>`;
},
summaryElement(node: SummaryElement) {
return `<summary>${node.children.map((child) => processor.stringify(child)).join("")}</summary>`;
},
},
} as any);

Expand All @@ -100,13 +102,11 @@ export async function mdast(options?: MdAstOptions) {

function usePlugins(p: Processor<Root>, phase: "parse" | "stringify"): void {
p.use(frontmatter);
p.use(remarkDetails);
if (_options.gfm !== false) {
p.use(remarkGitHubAlerts);
p.use(gfm);
}
if (_options.github !== false && phase === "stringify") {
p.use(github);
}
if (_options.directive !== false) p.use(directive);
if (_options.math !== false) p.use(math);
// no comments in MDX files
Expand Down
Loading
Loading