diff --git a/KernelMemory.sln b/KernelMemory.sln
index 9fb0e7c2d..ecbd45833 100644
--- a/KernelMemory.sln
+++ b/KernelMemory.sln
@@ -234,6 +234,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".github", ".github", "{B897
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "206-dotnet-configuration-and-logging", "examples\206-dotnet-configuration-and-logging\206-dotnet-configuration-and-logging.csproj", "{D7D8FCA0-BFC6-4B0E-9885-64C2EA90AA62}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "207-dotnet-expanding-chunks-on-retrieval", "examples\207-dotnet-expanding-chunks-on-retrieval\207-dotnet-expanding-chunks-on-retrieval.csproj", "{47CEEA8F-7858-4635-B902-4C704CF55EA0}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -299,6 +301,7 @@ Global
{9F564F2D-EADD-47DE-9293-92B3E9CFFE36} = {3C17F42B-CFC8-4900-8CFB-88936311E919}
{B8976338-7CDC-47AE-8502-C2FBAFBEBD68} = {6EF76FD8-4C35-4370-8539-5DDF45357A50}
{D7D8FCA0-BFC6-4B0E-9885-64C2EA90AA62} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
+ {47CEEA8F-7858-4635-B902-4C704CF55EA0} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{8A9FA587-7EBA-4D43-BE47-38D798B1C74C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
@@ -472,5 +475,8 @@ Global
{D7D8FCA0-BFC6-4B0E-9885-64C2EA90AA62}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D7D8FCA0-BFC6-4B0E-9885-64C2EA90AA62}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D7D8FCA0-BFC6-4B0E-9885-64C2EA90AA62}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {47CEEA8F-7858-4635-B902-4C704CF55EA0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {47CEEA8F-7858-4635-B902-4C704CF55EA0}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {47CEEA8F-7858-4635-B902-4C704CF55EA0}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
EndGlobal
diff --git a/examples/003-dotnet-Serverless/Program.cs b/examples/003-dotnet-Serverless/Program.cs
index 79d636cc0..9c805ae01 100644
--- a/examples/003-dotnet-Serverless/Program.cs
+++ b/examples/003-dotnet-Serverless/Program.cs
@@ -31,7 +31,7 @@
var memory = new KernelMemoryBuilder()
// .WithOpenAIDefaults(Env.Var("OPENAI_API_KEY"))
- // .WithOpenAI(openAICfg)
+ // .WithOpenAI(openAIConfig)
// .WithLlamaTextGeneration(llamaConfig)
.WithAzureOpenAITextGeneration(azureOpenAITextConfig, new DefaultGPTTokenizer())
.WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig, new DefaultGPTTokenizer())
diff --git a/examples/206-dotnet-configuration-and-logging/Program.cs b/examples/206-dotnet-configuration-and-logging/Program.cs
index bf1bdc338..63778f0a6 100644
--- a/examples/206-dotnet-configuration-and-logging/Program.cs
+++ b/examples/206-dotnet-configuration-and-logging/Program.cs
@@ -4,7 +4,7 @@
public static class Program
{
- // ReSharper disable InconsistentNaming
+ // ReSharper disable once InconsistentNaming
public static async Task Main()
{
var openAIConfig = new OpenAIConfig
diff --git a/examples/207-dotnet-expanding-chunks-on-retrieval/207-dotnet-expanding-chunks-on-retrieval.csproj b/examples/207-dotnet-expanding-chunks-on-retrieval/207-dotnet-expanding-chunks-on-retrieval.csproj
new file mode 100644
index 000000000..232814102
--- /dev/null
+++ b/examples/207-dotnet-expanding-chunks-on-retrieval/207-dotnet-expanding-chunks-on-retrieval.csproj
@@ -0,0 +1,26 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+  </PropertyGroup>
+
+  <PropertyGroup>
+    <IsPackable>false</IsPackable>
+    <UserSecretsId>5ee045b0-aea3-4f08-8d31-32d1a6f8fed0</UserSecretsId>
+    <NoWarn>$(NoWarn);CA1050;CA2000;CA1707;CA1303;CA2007;CA1724;CA1861;</NoWarn>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\service\Core\Core.csproj" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Update="appsettings.json;appsettings.Development.json;story.txt">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+
+</Project>
+
diff --git a/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs b/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
new file mode 100644
index 000000000..c6c332ba5
--- /dev/null
+++ b/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
@@ -0,0 +1,194 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.KernelMemory;
+using Microsoft.KernelMemory.AI.OpenAI;
+using Microsoft.KernelMemory.Configuration;
+using Microsoft.KernelMemory.ContentStorage.DevTools;
+using Microsoft.KernelMemory.FileSystem.DevTools;
+using Microsoft.KernelMemory.MemoryStorage.DevTools;
+
+/// <summary>
+/// This example shows how to retrieve N memory records before and after a relevant memory record.
+///
+/// Suppose you are uploading a book, and during the import KM splits the text into 10,000 partitions of 100 tokens, generating 10,000 memory records.
+/// When searching memory by similarity, the system returns a list of relevant memories, containing snippets of text ~100 tokens long.
+///
+/// Before sending text snippets to a LLM along with a question (RAG), you might want to include extra information, e.g. text PRECEDING and FOLLOWING each text snippet, e.g. 100 tokens extra on both sides:
+///
+/// ----------
+/// partition N - 1, memory record
+/// text snippet
+/// 100 tokens
+/// ----------
+/// partition N, RELEVANT memory record
+/// text snippet
+/// 100 tokens
+/// ----------
+/// partition N + 1, memory record
+/// text snippet
+/// 100 tokens
+/// ---------
+///
+/// The code below shows how to fetch records before and after each RELEVANT memory record, leveraging the Partition Number property.
+///
+/// Note: when importing documents, you can set `OverlappingTokens` so that each partition contains a part of the previous and the next partitions.
+/// This is another approach to always include a little more context, however this approach is limited by the max number of tokens an
+/// embedding generator can work with, and in a way affects the semantics of each text snippet.
+/// Also, when using the example below, you should consider setting OverlappingTokens to zero, to avoid text repetitions.
+/// </summary>
+public static class Program
+{
+ // ReSharper disable once InconsistentNaming
+ public static async Task Main()
+ {
+ // Partition input text in chunks of 100 tokens
+ const int PartitionSize = 100;
+
+ // Some sample long content
+ string story = await File.ReadAllTextAsync("story.txt");
+ const string Query = "astrobiology";
+ const float MinRelevance = 0.7f;
+ const int Limit = 2;
+
+ // Print the content size in tokens
+ var tokenCount = DefaultGPTTokenizer.StaticCountTokens(story);
+ Console.WriteLine($"Token count: {tokenCount}");
+
+ // Load OpenAI settings and API key
+ var openAIConfig = new OpenAIConfig();
+ new ConfigurationBuilder()
+ .AddJsonFile("appsettings.json")
+ .AddJsonFile("appsettings.Development.json", optional: true)
+ .Build()
+ .BindSection("KernelMemory:Services:OpenAI", openAIConfig);
+
+ // Customize memory records size (in tokens)
+ var textPartitioningOptions = new TextPartitioningOptions
+ {
+ MaxTokensPerParagraph = PartitionSize,
+ MaxTokensPerLine = PartitionSize,
+ OverlappingTokens = 0,
+ };
+
+ // Prepare memory instance, store memories on disk so import runs only once
+ var memory = new KernelMemoryBuilder()
+ .WithOpenAI(openAIConfig)
+ .WithCustomTextPartitioningOptions(textPartitioningOptions)
+ .WithSimpleFileStorage(new SimpleFileStorageConfig { StorageType = FileSystemTypes.Disk })
+ .WithSimpleVectorDb(new SimpleVectorDbConfig { StorageType = FileSystemTypes.Disk })
+ .Build();
+
+ // Load text into memory
+ Console.WriteLine("Importing memories...");
+ await memory.ImportTextAsync(story, documentId: "example207");
+
+ // Search
+ Console.WriteLine("Searching memories...");
+ SearchResult relevant = await memory.SearchAsync(query: Query, minRelevance: MinRelevance, limit: Limit);
+ Console.WriteLine($"Relevant documents: {relevant.Results.Count}");
+
+#if KernelMemoryDev
+        var relevantDocuments = new Dictionary<string, List<int>>();
+ foreach (Citation result in relevant.Results)
+ {
+ // Store the document IDs so we can load all their records later
+            relevantDocuments.Add(result.DocumentId, new List<int>());
+ Console.WriteLine($"Document ID: {result.DocumentId}");
+ Console.WriteLine($"Relevant partitions: {result.Partitions.Count}");
+ foreach (Citation.Partition partition in result.Partitions)
+ {
+ Console.WriteLine("--------------------------");
+ Console.WriteLine($"Partition number: {partition.PartitionNumber}");
+ Console.WriteLine($"Relevance: {partition.Relevance}\n");
+ Console.WriteLine(partition.Text);
+
+ relevantDocuments[result.DocumentId].Add(partition.PartitionNumber);
+ }
+
+ Console.WriteLine();
+ }
+
+ // For each relevant document
+ // Note: loops can be optimized for better perf, this code is only a demo
+ const int HowManyToAdd = 1;
+ Console.WriteLine("Fetching all document partitions...");
+        foreach (KeyValuePair<string, List<int>> relevantPartitionNumbers in relevantDocuments)
+ {
+ var docId = relevantPartitionNumbers.Key;
+ Console.WriteLine($"\nDocument ID: {docId}");
+
+ // Load all partitions. Note: the list might be out of order.
+ SearchResult all = await memory.SearchAsync("", filters: new[] { MemoryFilters.ByDocument(docId) }, limit: int.MaxValue);
+            List<Citation.Partition> allPartitionsContent = all.Results.FirstOrDefault()?.Partitions ?? new();
+
+ // Loop through the relevant partitions
+ foreach (int relevantPartitionNumber in relevantPartitionNumbers.Value)
+ {
+ Console.WriteLine("--------------------------");
+
+ // Use a data structure to order partitions by number
+                var result = new SortedDictionary<int, string>();
+
+ // Loop all partitions, include before and after the relevant ones
+ foreach (Citation.Partition p in allPartitionsContent)
+ {
+ if (Math.Abs(p.PartitionNumber - relevantPartitionNumber) <= HowManyToAdd)
+ {
+ result.Add(p.PartitionNumber, p.Text);
+ }
+ }
+
+ // Show partition and adjacent ones in order
+ foreach (var p in result)
+ {
+ Console.WriteLine($"Partition: {p.Key}");
+ Console.WriteLine(p.Value);
+ }
+
+ Console.WriteLine();
+ }
+ }
+#endif
+ }
+}
+
+/* Result:
+
+Token count: 2510
+Importing memories...
+Searching memories...
+Relevant documents: 1
+Document ID: example207
+Relevant partitions: 2
+--------------------------
+Partition number: 27
+Relevance: 0.8557962
+
+As scientific interest in [...] or ancient microbial life.
+--------------------------
+Partition number: 13
+Relevance: 0.85513425
+
+Gerald Marshall, the Chief [...] in astrobiological research."
+
+Fetching all document partitions...
+
+Document ID: example207
+--------------------------
+Partition: 26
+Dr. Mei Lin, a renowned [...] of life in the universe."
+Partition: 27
+As scientific interest [...] ancient microbial life.
+Partition: 28
+Meanwhile, back on Earth, [...] meaning in the universe.
+
+--------------------------
+Partition: 12
+Appearing as a glowing, [...] including its high CO2 levels.
+Partition: 13
+Gerald Marshall, the [...] in astrobiological research."
+Partition: 14
+While further studies [...] alien at the same time.
+
+*/
+
diff --git a/examples/207-dotnet-expanding-chunks-on-retrieval/appsettings.json b/examples/207-dotnet-expanding-chunks-on-retrieval/appsettings.json
new file mode 100644
index 000000000..88d693fba
--- /dev/null
+++ b/examples/207-dotnet-expanding-chunks-on-retrieval/appsettings.json
@@ -0,0 +1,36 @@
+{
+ "Logging": {
+ "LogLevel": {
+ "Default": "Warning",
+ // Examples: how to handle logs differently by class
+ // "Microsoft.KernelMemory.Handlers.TextExtractionHandler": "Information",
+ // "Microsoft.KernelMemory.Handlers.TextPartitioningHandler": "Information",
+ // "Microsoft.KernelMemory.Handlers.GenerateEmbeddingsHandler": "Information",
+ // "Microsoft.KernelMemory.Handlers.SaveEmbeddingsHandler": "Information",
+ // "Microsoft.KernelMemory.ContentStorage.AzureBlobs": "Information",
+ // "Microsoft.KernelMemory.Pipeline.Queue.AzureQueues": "Information",
+ "Microsoft.AspNetCore": "Warning"
+ }
+ },
+ "KernelMemory": {
+ "Services": {
+ "OpenAI": {
+ // Name of the model used to generate text (text completion or chat completion)
+ "TextModel": "gpt-3.5-turbo-16k",
+ // The max number of tokens supported by the text model.
+ "TextModelMaxTokenTotal": 16384,
+ // Name of the model used to generate text embeddings
+ "EmbeddingModel": "text-embedding-ada-002",
+ // The max number of tokens supported by the embedding model
+ // See https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
+ "EmbeddingModelMaxTokenTotal": 8191,
+ // OpenAI API Key
+ "APIKey": "",
+ // OpenAI Organization ID (usually empty, unless you have multiple accounts on different orgs)
+ "OrgId": "",
+ // How many times to retry in case of throttling
+ "MaxRetries": 10
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/examples/207-dotnet-expanding-chunks-on-retrieval/story.txt b/examples/207-dotnet-expanding-chunks-on-retrieval/story.txt
new file mode 100644
index 000000000..ff34bef2a
--- /dev/null
+++ b/examples/207-dotnet-expanding-chunks-on-retrieval/story.txt
@@ -0,0 +1,38 @@
+A strange and surprising event transpired upon the celestial manmade globe - the International Space Station. A vegetable of the red fruit variety, otherwise known on our terrestrial land as a 'tomato', was cultivated with the remarkable method of hydroponics, defying the hitherto believed necessity of soil for growth, and subsequently misplaced by the American Voyager, Mr. Frank Rubio.
+As trivial as it may seem, the plantation of this tomato held great significance, being the inaugural produce of its kind to flourish in the challenging conditions of the cosmos, and its inexplicable disappearance made for a comical investigation of sorts. Mr. Rubio, convinced of its safekeeping, found the prize fruit astray and upon his return to the Earth, the bewitching mystery of the vanishing tomato persisted.
+Much to the disquiet of Rubio, accusations of him having consumed the invaluable specimen disquieted the floating abode. He vehemently refuted the charges, attributing the disappearance to the curious character of the conditions in space, where objects not securely affixed could easily drift into unforeseen corners of the spacious station. Despite his rigorous search, the tomato evaded discovery.
+This incident of mirth, notwithstanding, Mr. Rubio's sojourn in space did not stay deprived of notable triumph. His stay in this amidst the heavenly spheres reached a duration hitherto unknown to any American voyager, marking a full Earth-year in space. Rendered longer owing to an unfortunate leak detected in his Russian Soyuz spacecraft, it proved to be a challenging, yet rewarding journey for Rubio.
+A resolution to the tale of the missing tomato finally came not during Mr. Rubio's stay, but with the revelation of the crew remaining in the station of the discovery of the missing specimen. Thus, even after returning to the terrestrial sphere, the voyager's innocence was ultimately affirmed, adding a closing chapter to this historical space oddity.
+Alas, despite the humour this event bequeathed, the great strides made in the science of celestial agriculture cannot be understated. The successful cultivation of a tomato under such harsh conditions bodes well for future endeavours of similar nature, serving as a promising beacon of mankind's progress against the unique challenges that space exploration poses.
+Id est, Rubio's 'lost in space' tomato sparks a shift from jest to marvel, creating a newfound appreciation for the advancements in scientific know-how, that led to the cultivation, and eventual rediscovery of a humble fruit in space.
+Mindful of the peculiar incident, the space administration contrived to install advanced object-tracking systems within the Space Station to avoid recurrent miscellany loss. A new regimen was also introduced to ensure that harvested produce was promptly accounted for and preserved, preventing any further produce-related mysteries.
+Simultaneously, this whimsical incident spurred a new stream of scientific study centered around the longevity and preservation of biotic material in a microgravity environment. Scientists discovered that the space-cultivated tomato, despite its desiccated state, presented unique characteristics not found in its Earth-grown counterparts.
+Detailed analysis revealed heightened concentrations of lycopene in the space-grown tomato, a potent antioxidant known for its numerous health benefits including reducing the risk of heart diseases and cancer. It was debated whether these enhanced features were a byproduct of the tomato's prolonged exposure to cosmic radiation or the unique hydroponic growth methodology adopted on the space station.
+Additionally, the longevity of the tomato in an un-refrigerated state sparked interest in bio-engineering crops for greater longevity on Earth, with potential implications for reducing food waste. The space life of the tomato, in all its humour and seriousness, may mark the beginning of far-reaching advancements in botanical sciences and space exploration.
+In a surprising twist to the tale, around the time the elusive tomato was found, the crew on the space station also stumbled upon something extraordinary — an unidentified substance found growing alongside the microgravity tomatoes. Initially thought to be a mold or fungus, subsequent analysis revealed an organic composition unlike anything known to Earth-bound biology.
+Appearing as a glowing, translucent mold, this substance showed a remarkable rate of growth and exhibited photosynthetic properties, drawing energy not just from sunlight, but also from other forms of radiation. It was able to adapt quickly to the environmental conditions of the space station, including its high CO2 levels.
+Gerald Marshall, the Chief Scientist on the team at NASA, said during a press briefing, "Our initial findings lead us to believe the matter is not terrestrial. Its unprecedented radiant energy conversion efficiency and adaptability are akin to, but far exceed, those seen in extremophile organisms on Earth. We are eager to undertake a comprehensive study and certainly, this could potentially mark a new chapter in astrobiological research."
+While further studies are underway, this intriguing finding sparked a flurry of interest and speculation within and outside the scientific community. This new organic matter, playfully named ‘Rubio's Radiant Mold’ in honor of astronaut Frank Rubio, could potentially reshape our understanding of life in the cosmos and further blur the lines between science fiction and reality. With each passing day, the 'final frontier' appears to become more familiar and intriguingly alien at the same time.
+As the scientific community buzzed with excitement over the discovery of Rubio's Radiant Mold, teams of researchers from around the world clamored to analyze the enigmatic substance. In laboratories equipped with state-of-the-art equipment, scientists meticulously studied the peculiar organism, hoping to unlock its secrets and unravel the mysteries of its origins.
+Initial analyses revealed startling findings. The organic composition of Rubio's Radiant Mold was unlike anything previously encountered on Earth. Its cellular structure exhibited unique adaptations to the harsh environment of space, including the ability to absorb and utilize various forms of radiation for energy production.
+Dr. Elena Vasquez, a leading astrobiologist at the European Space Agency, was among the scientists at the forefront of the research efforts. Her team conducted extensive genomic sequencing and comparative analysis to decipher the genetic code of the mysterious organism.
+"We were astounded by what we found," Dr. Vasquez remarked during a symposium on astrobiology. "The genetic makeup of Rubio's Radiant Mold suggests that it may have originated from beyond our solar system. Its DNA contains sequences that bear no resemblance to any known terrestrial life forms, indicating a possible extraterrestrial origin."
+The implications of this discovery were profound. If confirmed, Rubio's Radiant Mold would represent the first concrete evidence of alien life. The prospect of studying a living organism from another world ignited the imaginations of scientists and laypeople alike, sparking renewed interest in the search for life beyond Earth.
+As research into the organism progressed, additional revelations came to light. Analysis of the mold's metabolism revealed its remarkable efficiency in converting solar and cosmic radiation into usable energy. This capability raised intriguing possibilities for the development of advanced bioenergetic technologies and renewable energy sources.
+Furthermore, studies on Rubio's Radiant Mold uncovered its extraordinary resilience to environmental extremes. The organism demonstrated remarkable adaptability, thriving in conditions of microgravity, high radiation, and fluctuating atmospheric composition. These traits hinted at the potential for bioengineering breakthroughs in fields ranging from agriculture to space exploration.
+Amidst the scientific fervor surrounding the discovery, ethical questions arose regarding the handling and containment of Rubio's Radiant Mold. Concerns were raised about the potential risks posed by introducing an alien organism into Earth's biosphere, prompting calls for stringent quarantine protocols and risk assessment measures.
+Despite the challenges and uncertainties, the exploration of Rubio's Radiant Mold represented a watershed moment in humanity's quest to understand the cosmos and our place within it. With each revelation and breakthrough, the boundaries of knowledge expanded, pushing the frontiers of science and exploration ever further.
+As researchers delved deeper into the mysteries of the universe, one thing became abundantly clear: the discovery of Rubio's Radiant Mold was just the beginning of a new chapter in humanity's journey to unlock the secrets of the cosmos and explore the vast expanse of space. And with each step forward, the universe revealed itself to be more wondrous and awe-inspiring than we could have ever imagined.
+Amidst the excitement surrounding the study of Rubio's Radiant Mold, another unexpected development emerged from the depths of space. An unmanned probe sent to investigate a distant asteroid returned with astonishing data: traces of organic compounds previously unseen in the cosmos.
+Named the "Voyager Anomaly" after the famous American spacecraft, this discovery captivated astronomers and astrobiologists around the globe. The composition of these compounds defied conventional understanding, hinting at the possibility of extraterrestrial life existing beyond the confines of Earth.
+Dr. Mei Lin, a renowned astrophysicist at the forefront of the research, remarked, "The implications of the Voyager Anomaly are profound. For the first time, we have tangible evidence that the building blocks of life may not be unique to Earth. This opens up new avenues for exploration and challenges our preconceptions about the prevalence of life in the universe."
+As scientific interest in the anomaly intensified, international space agencies collaborated on ambitious missions to further investigate the asteroid belt and other celestial bodies. Cutting-edge spacecraft equipped with advanced instrumentation were dispatched to analyze the composition of asteroids and comets, searching for signs of biological activity or ancient microbial life.
+Meanwhile, back on Earth, the discovery of the Voyager Anomaly sparked a cultural and philosophical renaissance. Debates raged in academic circles and public forums about humanity's place in the cosmos and the potential implications of encountering extraterrestrial life. Religious leaders offered diverse interpretations of how such revelations might intersect with spiritual beliefs, sparking a global dialogue about the intersection of science, faith, and the search for meaning in the universe.
+In the midst of this intellectual ferment, a new generation of space explorers emerged, inspired by the tantalizing prospect of discovering life beyond Earth. Governments and private companies redoubled their efforts to develop space exploration technologies, envisioning a future where humanity ventures boldly into the unknown reaches of the cosmos.
+As humanity stood on the cusp of a new era of discovery, the legacy of Rubio's Radiant Mold and the enigmatic Voyager Anomaly served as potent reminders of the boundless potential of human curiosity and ingenuity. With each passing day, the stars beckoned with ever-greater allure, inviting us to venture forth and unravel the mysteries of the universe.
+As humanity's gaze turned towards the stars, the implications of the Voyager Anomaly and Rubio's Radiant Mold reverberated far beyond the confines of Earth's atmosphere. The newfound understanding of extraterrestrial organic compounds sparked renewed interest in lunar exploration and satellite observation.
+Moon missions, once relegated to the annals of history, experienced a resurgence as space agencies and private enterprises raced to establish permanent bases on the lunar surface. The discovery of potentially life-supporting compounds on asteroids reignited speculation about the moon's potential as a staging ground for further space exploration, prompting ambitious plans for manned missions and scientific research stations.
+Meanwhile, Earth's network of satellites underwent a transformative evolution as scientists and engineers reevaluated their role in light of the newfound understanding of cosmic phenomena. Satellite missions were recalibrated to search for signs of extraterrestrial life, with sensors and detectors repurposed to detect traces of organic compounds and analyze atmospheric compositions of distant celestial bodies.
+Furthermore, the discovery of Rubio's Radiant Mold spurred a renaissance in astrobiological research, with satellites equipped with specialized instruments deployed to study microbial life in extreme environments on Earth. From the depths of the ocean to the heights of the atmosphere, satellites surveyed the planet's ecosystems with unprecedented detail, shedding light on the diversity and resilience of life in even the most inhospitable environments.
+At the same time, concerns about the potential contamination of Earth's biosphere by extraterrestrial organisms led to the development of stringent quarantine protocols and containment measures for spacecraft returning from space missions. Satellite observation networks were expanded to monitor for any signs of biological contamination, ensuring the protection of terrestrial ecosystems from potentially harmful pathogens.
+As humanity ventured further into the cosmos, the boundaries between Earth and space blurred, giving rise to a new era of interconnectedness and exploration. The legacy of Rubio's Radiant Mold and the Voyager Anomaly served as catalysts for this transformative journey, inspiring future generations to reach for the stars and unlock the mysteries of the universe.
diff --git a/service/Abstractions/Constants.cs b/service/Abstractions/Constants.cs
index 2fb236eb8..a15453a58 100644
--- a/service/Abstractions/Constants.cs
+++ b/service/Abstractions/Constants.cs
@@ -33,6 +33,8 @@ public static class Constants
public const string ReservedDocumentIdTag = $"{ReservedTagsPrefix}document_id";
public const string ReservedFileIdTag = $"{ReservedTagsPrefix}file_id";
public const string ReservedFilePartitionTag = $"{ReservedTagsPrefix}file_part";
+ public const string ReservedFilePartitionNumberTag = $"{ReservedTagsPrefix}part_n";
+ public const string ReservedFileSectionNumberTag = $"{ReservedTagsPrefix}sect_n";
public const string ReservedFileTypeTag = $"{ReservedTagsPrefix}file_type";
public const string ReservedSyntheticTypeTag = $"{ReservedTagsPrefix}synth";
diff --git a/service/Abstractions/Models/Citation.cs b/service/Abstractions/Models/Citation.cs
index 4e8761429..f77390e70 100644
--- a/service/Abstractions/Models/Citation.cs
+++ b/service/Abstractions/Models/Citation.cs
@@ -86,6 +86,20 @@ public class Partition
[JsonPropertyOrder(2)]
public float Relevance { get; set; } = 0;
+    /// <summary>
+    /// Partition number, zero based
+    /// </summary>
+ [JsonPropertyName("partitionNumber")]
+ [JsonPropertyOrder(3)]
+ public int PartitionNumber { get; set; } = 0;
+
+    /// <summary>
+    /// Text page number / Audio segment number / Video scene number
+    /// </summary>
+ [JsonPropertyName("sectionNumber")]
+ [JsonPropertyOrder(4)]
+ public int SectionNumber { get; set; } = 0;
+
///
/// Timestamp about the file/text partition.
///
diff --git a/service/Abstractions/Pipeline/DataPipeline.cs b/service/Abstractions/Pipeline/DataPipeline.cs
index cfaf7af0b..dfa7dadad 100644
--- a/service/Abstractions/Pipeline/DataPipeline.cs
+++ b/service/Abstractions/Pipeline/DataPipeline.cs
@@ -86,9 +86,23 @@ public abstract class FileDetailsBase
public ArtifactTypes ArtifactType { get; set; } = ArtifactTypes.Undefined;
///
- /// File tags. Note, the data structure allows file tags to differ from the document tags.
+ /// If the file is a partition, which partition number in the list of partitions extracted from a file.
///
[JsonPropertyOrder(5)]
+ [JsonPropertyName("partition_number")]
+ public int PartitionNumber { get; set; } = 0;
+
+    /// <summary>
+    /// If the file is a partition, which document page / audio segment / video scene it comes from.
+    /// </summary>
+ [JsonPropertyOrder(6)]
+ [JsonPropertyName("section_number")]
+ public int SectionNumber { get; set; } = 0;
+
+    /// <summary>
+    /// File tags. Note, the data structure allows file tags to differ from the document tags.
+    /// </summary>
+ [JsonPropertyOrder(7)]
[JsonPropertyName("tags")]
public TagCollection Tags { get; set; } = new();
diff --git a/service/Core/Handlers/GenerateEmbeddingsHandler.cs b/service/Core/Handlers/GenerateEmbeddingsHandler.cs
index 3c9106393..0351c8b08 100644
--- a/service/Core/Handlers/GenerateEmbeddingsHandler.cs
+++ b/service/Core/Handlers/GenerateEmbeddingsHandler.cs
@@ -73,6 +73,7 @@ public GenerateEmbeddingsHandler(
this._log.LogDebug("Generating embeddings, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);
+ var partitionsFound = false;
foreach (var uploadedFile in pipeline.Files)
{
// Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it)
@@ -83,6 +84,7 @@ public GenerateEmbeddingsHandler(
var partitionFile = generatedFile.Value;
if (partitionFile.AlreadyProcessedBy(this))
{
+ partitionsFound = true;
this._log.LogTrace("File {0} already processed by this handler", partitionFile.Name);
continue;
}
@@ -95,6 +97,8 @@ public GenerateEmbeddingsHandler(
continue;
}
+ partitionsFound = true;
+
// TODO: cost/perf: if the partition SHA256 is the same and the embedding exists, avoid generating it again
switch (partitionFile.MimeType)
{
@@ -153,6 +157,10 @@ public GenerateEmbeddingsHandler(
Size = text.Length,
MimeType = MimeTypes.TextEmbeddingVector,
ArtifactType = DataPipeline.ArtifactTypes.TextEmbeddingVector,
+#if KernelMemoryDev
+ PartitionNumber = partitionFile.PartitionNumber,
+ SectionNumber = partitionFile.SectionNumber,
+#endif
Tags = partitionFile.Tags,
};
embeddingFileNameDetails.MarkProcessedBy(this);
@@ -176,6 +184,11 @@ public GenerateEmbeddingsHandler(
}
}
+ if (!partitionsFound)
+ {
+ this._log.LogWarning("Pipeline '{0}/{1}': text partitions not found, cannot generate embeddings, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
+ }
+
return (true, pipeline);
}
diff --git a/service/Core/Handlers/SaveRecordsHandler.cs b/service/Core/Handlers/SaveRecordsHandler.cs
index 00f590981..6c3874636 100644
--- a/service/Core/Handlers/SaveRecordsHandler.cs
+++ b/service/Core/Handlers/SaveRecordsHandler.cs
@@ -100,37 +100,50 @@ public SaveRecordsHandler(
public async Task<(bool success, DataPipeline updatedPipeline)> SaveEmbeddingsAsync(
DataPipeline pipeline, CancellationToken cancellationToken = default)
{
+ var embeddingsFound = false;
+
// For each embedding file => For each Memory DB => Upsert record
- foreach (FileDetailsWithRecordId file in GetListOfEmbeddingFiles(pipeline))
+ foreach (FileDetailsWithRecordId embeddingFile in GetListOfEmbeddingFiles(pipeline))
{
- if (file.File.AlreadyProcessedBy(this))
+ if (embeddingFile.File.AlreadyProcessedBy(this))
{
- this._log.LogTrace("File {0} already processed by this handler", file.File.Name);
+ embeddingsFound = true;
+ this._log.LogTrace("File {0} already processed by this handler", embeddingFile.File.Name);
continue;
}
- string vectorJson = await this._orchestrator.ReadTextFileAsync(pipeline, file.File.Name, cancellationToken).ConfigureAwait(false);
+ string vectorJson = await this._orchestrator.ReadTextFileAsync(pipeline, embeddingFile.File.Name, cancellationToken).ConfigureAwait(false);
EmbeddingFileContent? embeddingData = JsonSerializer.Deserialize(vectorJson.RemoveBOM().Trim());
if (embeddingData == null)
{
- throw new OrchestrationException($"Unable to deserialize embedding file {file.File.Name}");
+ throw new OrchestrationException($"Unable to deserialize embedding file {embeddingFile.File.Name}");
}
+ embeddingsFound = true;
+
+ DataPipeline.FileDetails fileDetails = pipeline.GetFile(embeddingFile.File.ParentId);
string partitionContent = await this._orchestrator.ReadTextFileAsync(pipeline, embeddingData.SourceFileName, cancellationToken).ConfigureAwait(false);
- string url = await this.GetSourceUrlAsync(pipeline, pipeline.GetFile(file.File.ParentId), cancellationToken).ConfigureAwait(false);
+ string url = await this.GetSourceUrlAsync(pipeline, fileDetails, cancellationToken).ConfigureAwait(false);
var record = PrepareRecord(
pipeline: pipeline,
- recordId: file.RecordId,
- fileName: pipeline.GetFile(file.File.ParentId).Name,
+ recordId: embeddingFile.RecordId,
+ fileName: fileDetails.Name,
url: url,
- fileId: file.File.ParentId,
- partitionFileId: file.File.SourcePartitionId,
+ fileId: embeddingFile.File.ParentId,
+ partitionFileId: embeddingFile.File.SourcePartitionId,
partitionContent: partitionContent,
+#if KernelMemoryDev
+ partitionNumber: embeddingFile.File.PartitionNumber,
+ sectionNumber: embeddingFile.File.SectionNumber,
+#else
+ partitionNumber: 0,
+ sectionNumber: 0,
+#endif
partitionEmbedding: embeddingData.Vector,
embeddingGeneratorProvider: embeddingData.GeneratorProvider,
embeddingGeneratorName: embeddingData.GeneratorName,
- file.File.Tags);
+ embeddingFile.File.Tags);
foreach (IMemoryDb client in this._memoryDbs)
{
@@ -141,7 +154,12 @@ public SaveRecordsHandler(
await client.UpsertAsync(pipeline.Index, record, cancellationToken).ConfigureAwait(false);
}
- file.File.MarkProcessedBy(this);
+ embeddingFile.File.MarkProcessedBy(this);
+ }
+
+ if (!embeddingsFound)
+ {
+ this._log.LogWarning("Pipeline '{0}/{1}': embeddings not found, cannot save embeddings, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
}
return (true, pipeline);
@@ -153,31 +171,44 @@ public SaveRecordsHandler(
public async Task<(bool success, DataPipeline updatedPipeline)> SavePartitionsAsync(
DataPipeline pipeline, CancellationToken cancellationToken = default)
{
+ var partitionsFound = false;
+
// Create records only for partitions (text chunks) and synthetic data
foreach (FileDetailsWithRecordId file in GetListOfPartitionAndSyntheticFiles(pipeline))
{
if (file.File.AlreadyProcessedBy(this))
{
+ partitionsFound = true;
this._log.LogTrace("File {0} already processed by this handler", file.File.Name);
continue;
}
+ partitionsFound = true;
+
switch (file.File.MimeType)
{
case MimeTypes.PlainText:
case MimeTypes.MarkDown:
+ DataPipeline.FileDetails partitionFileDetails = pipeline.GetFile(file.File.ParentId);
string partitionContent = await this._orchestrator.ReadTextFileAsync(pipeline, file.File.Name, cancellationToken).ConfigureAwait(false);
- string url = await this.GetSourceUrlAsync(pipeline, pipeline.GetFile(file.File.ParentId), cancellationToken).ConfigureAwait(false);
+ string url = await this.GetSourceUrlAsync(pipeline, partitionFileDetails, cancellationToken).ConfigureAwait(false);
var record = PrepareRecord(
pipeline: pipeline,
recordId: file.RecordId,
- fileName: pipeline.GetFile(file.File.ParentId).Name,
+ fileName: partitionFileDetails.Name,
url: url,
fileId: file.File.ParentId,
partitionFileId: file.File.Id,
partitionContent: partitionContent,
+#if KernelMemoryDev
+ partitionNumber: partitionFileDetails.PartitionNumber,
+ sectionNumber: partitionFileDetails.SectionNumber,
+#else
+ partitionNumber: 0,
+ sectionNumber: 0,
+#endif
partitionEmbedding: new Embedding(),
embeddingGeneratorProvider: "",
embeddingGeneratorName: "",
@@ -202,6 +233,11 @@ public SaveRecordsHandler(
file.File.MarkProcessedBy(this);
}
+ if (!partitionsFound)
+ {
+ this._log.LogWarning("Pipeline '{0}/{1}': partitions and synthetic records not found, cannot save, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
+ }
+
return (true, pipeline);
}
@@ -262,6 +298,23 @@ private async Task GetSourceUrlAsync(
return fileContent.ToString();
}
+ ///
+ /// Prepare a record to be saved in a memory DB
+ ///
+ /// Pipeline object (TODO: pass only data)
+ /// DB record ID
+ /// Filename
+ /// Web page URL, if any
+ /// ID assigned to the file (note: a document can contain multiple files)
+ /// ID assigned to the partition (or synth) file generated during the import
+ /// Content of the partition
+ /// Number of the partition, starting from zero
+ /// Page number (if the doc is paginated), audio segment number, video scene number, etc.
+ /// Embedding vector calculated from the partition content
+ /// Name of the embedding provider (e.g. Azure), for future use when using multiple embedding types concurrently
+ /// Name of the model used to generate embeddings, for future use
+ /// Collection of tags assigned to the record
+ /// Memory record ready to be saved
private static MemoryRecord PrepareRecord(
DataPipeline pipeline,
string recordId,
@@ -270,6 +323,8 @@ private static MemoryRecord PrepareRecord(
string fileId,
string partitionFileId,
string partitionContent,
+ int partitionNumber,
+ int sectionNumber,
Embedding partitionEmbedding,
string embeddingGeneratorProvider,
string embeddingGeneratorName,
@@ -313,6 +368,12 @@ private static MemoryRecord PrepareRecord(
// Partition ID. Filtering used for purge.
record.Tags.Add(Constants.ReservedFilePartitionTag, partitionFileId);
+#if KernelMemoryDev
+ // Partition number (starting from 0) and Page number (provided by text extractor)
+ record.Tags.Add(Constants.ReservedFilePartitionNumberTag, $"{partitionNumber}");
+ record.Tags.Add(Constants.ReservedFileSectionNumberTag, $"{sectionNumber}");
+#endif
+
/*
* TIMESTAMP and USER TAGS
*/
diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs
index fb524fbca..77133bfb9 100644
--- a/service/Core/Handlers/TextPartitioningHandler.cs
+++ b/service/Core/Handlers/TextPartitioningHandler.cs
@@ -77,6 +77,12 @@ public TextPartitioningHandler(
{
this._log.LogDebug("Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);
+ if (pipeline.Files.Count == 0)
+ {
+ this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
+ return (true, pipeline);
+ }
+
foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
{
// Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it)
@@ -99,8 +105,8 @@ public TextPartitioningHandler(
}
// Use a different partitioning strategy depending on the file type
- List paragraphs;
- List lines;
+ List partitions;
+ List sentences;
BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
// Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
@@ -112,9 +118,9 @@ public TextPartitioningHandler(
{
this._log.LogDebug("Partitioning text file {0}", file.Name);
string content = partitionContent.ToString();
- lines = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
- paragraphs = TextChunker.SplitPlainTextParagraphs(
- lines, maxTokensPerParagraph: this._options.MaxTokensPerParagraph, overlapTokens: this._options.OverlappingTokens, tokenCounter: this._tokenCounter);
+ sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
+ partitions = TextChunker.SplitPlainTextParagraphs(
+ sentences, maxTokensPerParagraph: this._options.MaxTokensPerParagraph, overlapTokens: this._options.OverlappingTokens, tokenCounter: this._tokenCounter);
break;
}
@@ -122,9 +128,9 @@ public TextPartitioningHandler(
{
this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
string content = partitionContent.ToString();
- lines = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
- paragraphs = TextChunker.SplitMarkdownParagraphs(
- lines, maxTokensPerParagraph: this._options.MaxTokensPerParagraph, overlapTokens: this._options.OverlappingTokens, tokenCounter: this._tokenCounter);
+ sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
+ partitions = TextChunker.SplitMarkdownParagraphs(
+ sentences, maxTokensPerParagraph: this._options.MaxTokensPerParagraph, overlapTokens: this._options.OverlappingTokens, tokenCounter: this._tokenCounter);
break;
}
@@ -137,18 +143,22 @@ public TextPartitioningHandler(
continue;
}
- if (paragraphs.Count == 0) { continue; }
+ if (partitions.Count == 0) { continue; }
- this._log.LogDebug("Saving {0} file partitions", paragraphs.Count);
- for (int index = 0; index < paragraphs.Count; index++)
+ this._log.LogDebug("Saving {0} file partitions", partitions.Count);
+ for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++)
{
- string text = paragraphs[index];
+ // TODO: turn partitions in objects with more details, e.g. page number
+ string text = partitions[partitionNumber];
+#if KernelMemoryDev
+ int sectionNumber = 0; // TODO: use this to store the page number (if any)
+#endif
BinaryData textData = new(text);
int tokenCount = this._tokenCounter(text);
this._log.LogDebug("Partition size: {0} tokens", tokenCount);
- var destFile = uploadedFile.GetPartitionFileName(index);
+ var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);
var destFileDetails = new DataPipeline.GeneratedFileDetails
@@ -159,6 +169,10 @@ public TextPartitioningHandler(
Size = text.Length,
MimeType = MimeTypes.PlainText,
ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
+#if KernelMemoryDev
+ PartitionNumber = partitionNumber,
+ SectionNumber = sectionNumber,
+#endif
Tags = pipeline.Tags,
ContentSHA256 = textData.CalculateSHA256(),
};
diff --git a/service/Core/MemoryStorage/MemoryRecordExtensions.cs b/service/Core/MemoryStorage/MemoryRecordExtensions.cs
new file mode 100644
index 000000000..d229800b4
--- /dev/null
+++ b/service/Core/MemoryStorage/MemoryRecordExtensions.cs
@@ -0,0 +1,138 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.Extensions.Logging;
+
+namespace Microsoft.KernelMemory.MemoryStorage;
+
+///
+/// Extensions of
+///
+#pragma warning disable CA1055 // working with simple types
+public static class MemoryRecordExtensions
+{
+ ///
+ /// Get document ID
+ ///
+ public static string GetDocumentId(this MemoryRecord record, ILogger? log = null)
+ {
+ return record.GetTagValue(Constants.ReservedDocumentIdTag, log);
+ }
+
+ ///
+ /// Get file ID
+ ///
+ public static string GetFileId(this MemoryRecord record, ILogger? log = null)
+ {
+ return record.GetTagValue(Constants.ReservedFileIdTag, log);
+ }
+
+ ///
+ /// Get partition number, starting from zero.
+ ///
+ public static int GetPartitionNumber(this MemoryRecord record, ILogger? log = null)
+ {
+#if KernelMemoryDev
+ var value = record.GetTagValue(Constants.ReservedFilePartitionNumberTag, log);
+ if (string.IsNullOrEmpty(value))
+ {
+ return 0;
+ }
+#else
+ var value = "0";
+#endif
+
+ return int.TryParse(value, out int number) ? number : 0;
+ }
+
+ ///
+ /// Get page number / audio segment number / video scene number
+ ///
+ public static int GetSectionNumber(this MemoryRecord record, ILogger? log = null)
+ {
+#if KernelMemoryDev
+ var value = record.GetTagValue(Constants.ReservedFileSectionNumberTag, log);
+ if (string.IsNullOrEmpty(value))
+ {
+ return 0;
+ }
+#else
+ var value = "0";
+#endif
+
+ return int.TryParse(value, out int number) ? number : 0;
+ }
+
+ ///
+ /// Get file MIME type
+ ///
+ public static string GetFileContentType(this MemoryRecord record, ILogger? log = null)
+ {
+ return record.GetTagValue(Constants.ReservedFileTypeTag, log);
+ }
+
+ ///
+ /// Get web page URL, if the document was a web page
+ ///
+ public static string? GetWebPageUrl(this MemoryRecord record, ILogger? log = null)
+ {
+ var result = record.GetPayloadValue(Constants.ReservedPayloadUrlField, log)?.ToString();
+ return string.IsNullOrWhiteSpace(result) ? null : result;
+ }
+
+ ///
+ /// Get file name
+ ///
+ public static string GetFileName(this MemoryRecord record, ILogger? log = null)
+ {
+ return record.GetPayloadValue(Constants.ReservedPayloadFileNameField, log)?.ToString() ?? string.Empty;
+ }
+
+ ///
+ /// Get the partition text content
+ ///
+ public static string GetPartitionText(this MemoryRecord record, ILogger? log = null)
+ {
+ return record.GetPayloadValue(Constants.ReservedPayloadTextField, log)?.ToString() ?? string.Empty;
+ }
+
+ ///
+ /// Get the date/time of the last update
+ ///
+ public static DateTimeOffset GetLastUpdate(this MemoryRecord record, ILogger? log = null)
+ {
+ var value = record.GetPayloadValue(Constants.ReservedPayloadLastUpdateField, log);
+ return DateTimeOffset.TryParse(value?.ToString() ?? string.Empty, out var date) ? date : DateTimeOffset.MinValue;
+ }
+
+ ///
+ /// Return a memory record tag value if available
+ ///
+ public static string GetTagValue(this MemoryRecord record, string tagName, ILogger? log = null)
+ {
+ if (!record.Tags.TryGetValue(tagName, out List? tagValues))
+ {
+ log?.LogError("Memory record '{0}' doesn't contain a '{1}' tag", record.Id, tagName);
+ return string.Empty;
+ }
+
+ return tagValues.FirstOrDefault() ?? string.Empty;
+ }
+
+ ///
+ /// Return a memory record payload value if available
+ ///
+ public static object? GetPayloadValue(this MemoryRecord record, string payloadKey, ILogger? log = null)
+ {
+ if (!record.Payload.TryGetValue(payloadKey, out object? value))
+ {
+ log?.LogError("Memory record '{0}' doesn't contain a '{1}' payload", record.Id, payloadKey);
+ return null;
+ }
+
+ return value;
+ }
+}
+#pragma warning restore CA1055
diff --git a/service/Core/Search/SearchClient.cs b/service/Core/Search/SearchClient.cs
index 56e427359..56b53e0b4 100644
--- a/service/Core/Search/SearchClient.cs
+++ b/service/Core/Search/SearchClient.cs
@@ -118,47 +118,23 @@ public async Task SearchAsync(
// Memories are sorted by relevance, starting from the most relevant
foreach ((MemoryRecord memory, double relevance) in list)
{
- if (!memory.Tags.ContainsKey(Constants.ReservedDocumentIdTag))
- {
- this._log.LogError("The memory record is missing the '{0}' tag", Constants.ReservedDocumentIdTag);
- }
-
- if (!memory.Tags.ContainsKey(Constants.ReservedFileIdTag))
- {
- this._log.LogError("The memory record is missing the '{0}' tag", Constants.ReservedFileIdTag);
- }
-
- if (!memory.Tags.ContainsKey(Constants.ReservedFileTypeTag))
- {
- this._log.LogError("The memory record is missing the '{0}' tag", Constants.ReservedFileTypeTag);
- }
-
// Note: a document can be composed by multiple files
- string documentId = memory.Tags[Constants.ReservedDocumentIdTag].FirstOrDefault() ?? string.Empty;
+ string documentId = memory.GetDocumentId(this._log);
// Identify the file in case there are multiple files
- string fileId = memory.Tags[Constants.ReservedFileIdTag].FirstOrDefault() ?? string.Empty;
+ string fileId = memory.GetFileId(this._log);
- // TODO: URL to access the file
+ // TODO: URL to access the file in content storage
string linkToFile = $"{index}/{documentId}/{fileId}";
- string fileContentType = memory.Tags[Constants.ReservedFileTypeTag].FirstOrDefault() ?? string.Empty;
- string fileName = memory.Payload[Constants.ReservedPayloadFileNameField].ToString() ?? string.Empty;
-
- // URL the source, used for web pages and external data. Null when empty.
- string? sourceUrl = memory.Payload[Constants.ReservedPayloadUrlField].ToString();
-
- var partitionText = memory.Payload[Constants.ReservedPayloadTextField].ToString()?.Trim() ?? "";
+ var partitionText = memory.GetPartitionText(this._log).Trim();
if (string.IsNullOrEmpty(partitionText))
{
this._log.LogError("The document partition is empty, doc: {0}", memory.Id);
continue;
}
- if (relevance > float.MinValue)
- {
- this._log.LogTrace("Adding result with relevance {0}", relevance);
- }
+ if (relevance > float.MinValue) { this._log.LogTrace("Adding result with relevance {0}", relevance); }
// If the file is already in the list of citations, only add the partition
var citation = result.Results.FirstOrDefault(x => x.Link == linkToFile);
@@ -173,19 +149,19 @@ public async Task SearchAsync(
citation.DocumentId = documentId;
citation.FileId = fileId;
citation.Link = linkToFile;
- citation.SourceContentType = fileContentType;
- citation.SourceName = fileName;
- citation.SourceUrl = string.IsNullOrWhiteSpace(sourceUrl) ? null : sourceUrl;
-
-#pragma warning disable CA1806 // it's ok if parsing fails
- DateTimeOffset.TryParse(memory.Payload[Constants.ReservedPayloadLastUpdateField].ToString(), out var lastUpdate);
-#pragma warning restore CA1806
+ citation.SourceContentType = memory.GetFileContentType(this._log);
+ citation.SourceName = memory.GetFileName(this._log);
+ citation.SourceUrl = memory.GetWebPageUrl();
citation.Partitions.Add(new Citation.Partition
{
Text = partitionText,
Relevance = (float)relevance,
- LastUpdate = lastUpdate,
+#if KernelMemoryDev
+ PartitionNumber = memory.GetPartitionNumber(this._log),
+ SectionNumber = memory.GetSectionNumber(),
+#endif
+ LastUpdate = memory.GetLastUpdate(),
Tags = memory.Tags,
});
}
@@ -246,44 +222,26 @@ public async Task AskAsync(
// Memories are sorted by relevance, starting from the most relevant
await foreach ((MemoryRecord memory, double relevance) in matches.ConfigureAwait(false))
{
- if (!memory.Tags.ContainsKey(Constants.ReservedDocumentIdTag))
- {
- this._log.LogError("The memory record is missing the '{0}' tag", Constants.ReservedDocumentIdTag);
- }
-
- if (!memory.Tags.ContainsKey(Constants.ReservedFileIdTag))
- {
- this._log.LogError("The memory record is missing the '{0}' tag", Constants.ReservedFileIdTag);
- }
-
- if (!memory.Tags.ContainsKey(Constants.ReservedFileTypeTag))
- {
- this._log.LogError("The memory record is missing the '{0}' tag", Constants.ReservedFileTypeTag);
- }
-
// Note: a document can be composed by multiple files
- string documentId = memory.Tags[Constants.ReservedDocumentIdTag].FirstOrDefault() ?? string.Empty;
+ string documentId = memory.GetDocumentId(this._log);
// Identify the file in case there are multiple files
- string fileId = memory.Tags[Constants.ReservedFileIdTag].FirstOrDefault() ?? string.Empty;
+ string fileId = memory.GetFileId(this._log);
- // TODO: URL to access the file
+ // TODO: URL to access the file in content storage
string linkToFile = $"{index}/{documentId}/{fileId}";
- string fileContentType = memory.Tags[Constants.ReservedFileTypeTag].FirstOrDefault() ?? string.Empty;
- string fileName = memory.Payload[Constants.ReservedPayloadFileNameField].ToString() ?? string.Empty;
+ string fileName = memory.GetFileName(this._log);
- // URL the source, used for web pages and external data. Null when empty.
- string? sourceUrl = memory.Payload[Constants.ReservedPayloadUrlField].ToString();
-
- factsAvailableCount++;
- var partitionText = memory.Payload[Constants.ReservedPayloadTextField].ToString()?.Trim() ?? "";
+ var partitionText = memory.GetPartitionText(this._log).Trim();
if (string.IsNullOrEmpty(partitionText))
{
this._log.LogError("The document partition is empty, doc: {0}", memory.Id);
continue;
}
+ factsAvailableCount++;
+
// TODO: add file age in days, to push relevance of newer documents
var fact = $"==== [File:{fileName};Relevance:{relevance:P1}]:\n{partitionText}\n";
@@ -296,7 +254,7 @@ public async Task AskAsync(
}
factsUsedCount++;
- this._log.LogTrace("Adding text {0} with relevance {1}", factsUsedCount, relevance);
+ if (relevance > float.MinValue) { this._log.LogTrace("Adding text {0} with relevance {1}", factsUsedCount, relevance); }
facts.Append(fact);
tokensAvailable -= size;
@@ -314,19 +272,15 @@ public async Task AskAsync(
citation.DocumentId = documentId;
citation.FileId = fileId;
citation.Link = linkToFile;
- citation.SourceContentType = fileContentType;
+ citation.SourceContentType = memory.GetFileContentType(this._log);
citation.SourceName = fileName;
- citation.SourceUrl = string.IsNullOrWhiteSpace(sourceUrl) ? null : sourceUrl;
-
-#pragma warning disable CA1806 // it's ok if parsing fails
- DateTimeOffset.TryParse(memory.Payload[Constants.ReservedPayloadLastUpdateField].ToString(), out var lastUpdate);
-#pragma warning restore CA1806
+ citation.SourceUrl = memory.GetWebPageUrl();
citation.Partitions.Add(new Citation.Partition
{
Text = partitionText,
Relevance = (float)relevance,
- LastUpdate = lastUpdate,
+ LastUpdate = memory.GetLastUpdate(),
Tags = memory.Tags,
});
}