-
Notifications
You must be signed in to change notification settings - Fork 394
feat: add runtime cache API for TensorRT-RTX #4180
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -92,6 +92,7 @@ def cross_compile_for_windows( | |
| dryrun: bool = _defaults.DRYRUN, | ||
| hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, | ||
| timing_cache_path: str = _defaults.TIMING_CACHE_PATH, | ||
| runtime_cache_path: str = _defaults.RUNTIME_CACHE_PATH, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Runtime cache is a JIT-time API: it may not make much sense for cross compilation.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree — since it is a JIT-time cache, it doesn't make sense here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great, thanks for the feedback Lan 🙏
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added in 3893fa4, which emits a warning. |
||
| lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, | ||
| cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, | ||
| reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, | ||
|
|
@@ -170,7 +171,8 @@ def cross_compile_for_windows( | |
| enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. | ||
| dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs | ||
| hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) | ||
| timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation | ||
| timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX. | ||
| runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. Not used for standard TensorRT. | ||
| lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. | ||
| cache_built_engines (bool): Whether to save the compiled TRT engines to storage | ||
| reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage | ||
|
|
@@ -334,6 +336,7 @@ def cross_compile_for_windows( | |
| "dryrun": dryrun, | ||
| "hardware_compatible": hardware_compatible, | ||
| "timing_cache_path": timing_cache_path, | ||
| "runtime_cache_path": runtime_cache_path, | ||
| "lazy_engine_init": lazy_engine_init, | ||
| "cache_built_engines": cache_built_engines, | ||
| "reuse_cached_engines": reuse_cached_engines, | ||
|
|
@@ -366,6 +369,12 @@ def cross_compile_for_windows( | |
| f"arg: {key} is not supported for cross compilation for windows feature, hence it is disabled." | ||
| ) | ||
|
|
||
| if "runtime_cache_path" in compilation_options: | ||
| compilation_options.pop("runtime_cache_path") | ||
| logger.warning( | ||
| "runtime_cache_path is a JIT-time API and is not applicable to cross compilation for windows. Ignoring." | ||
| ) | ||
|
|
||
| settings = CompilationSettings(**compilation_options) | ||
| logger.info("Compilation Settings: %s\n", settings) | ||
| exported_program = pre_export_lowering(exported_program, settings) | ||
|
|
@@ -438,6 +447,7 @@ def compile( | |
| dryrun: bool = _defaults.DRYRUN, | ||
| hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, | ||
| timing_cache_path: str = _defaults.TIMING_CACHE_PATH, | ||
| runtime_cache_path: str = _defaults.RUNTIME_CACHE_PATH, | ||
| lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, | ||
| cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, | ||
| reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, | ||
|
|
@@ -531,7 +541,8 @@ def compile( | |
| enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. | ||
| dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs | ||
| hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) | ||
| timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation | ||
| timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX. | ||
| runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. Not used for standard TensorRT. | ||
| lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. | ||
| cache_built_engines (bool): Whether to save the compiled TRT engines to storage | ||
| reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage | ||
|
|
@@ -738,6 +749,7 @@ def compile( | |
| "dryrun": dryrun, | ||
| "hardware_compatible": hardware_compatible, | ||
| "timing_cache_path": timing_cache_path, | ||
| "runtime_cache_path": runtime_cache_path, | ||
| "lazy_engine_init": lazy_engine_init, | ||
| "cache_built_engines": cache_built_engines, | ||
| "reuse_cached_engines": reuse_cached_engines, | ||
|
|
@@ -1150,6 +1162,7 @@ def convert_exported_program_to_serialized_trt_engine( | |
| dryrun: bool = _defaults.DRYRUN, | ||
| hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, | ||
| timing_cache_path: str = _defaults.TIMING_CACHE_PATH, | ||
| runtime_cache_path: str = _defaults.RUNTIME_CACHE_PATH, | ||
| lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, | ||
| cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, | ||
| reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, | ||
|
|
@@ -1224,7 +1237,8 @@ def convert_exported_program_to_serialized_trt_engine( | |
| enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. | ||
| dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs | ||
| hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) | ||
| timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation | ||
| timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX. | ||
| runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. Not used for standard TensorRT. | ||
| lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. | ||
| cache_built_engines (bool): Whether to save the compiled TRT engines to storage | ||
| reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage | ||
|
|
@@ -1397,6 +1411,7 @@ def convert_exported_program_to_serialized_trt_engine( | |
| "dryrun": dryrun, | ||
| "hardware_compatible": hardware_compatible, | ||
| "timing_cache_path": timing_cache_path, | ||
| "runtime_cache_path": runtime_cache_path, | ||
| "lazy_engine_init": lazy_engine_init, | ||
| "cache_built_engines": cache_built_engines, | ||
| "reuse_cached_engines": reuse_cached_engines, | ||
|
|
@@ -1413,6 +1428,11 @@ def convert_exported_program_to_serialized_trt_engine( | |
| "use_distributed_mode_trace": use_distributed_mode_trace, | ||
| "decompose_attention": decompose_attention, | ||
| } | ||
| if "runtime_cache_path" in compilation_options: | ||
| compilation_options.pop("runtime_cache_path") | ||
| logger.warning( | ||
| "runtime_cache_path is a JIT-time API and is not applicable to serialized engine export. Ignoring." | ||
| ) | ||
|
|
||
| settings = CompilationSettings(**compilation_options) | ||
| logger.info("Compilation Settings: %s\n", settings) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.