Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
679cb09
Inject profiling for function calls to 'halide_copy_to_host' and 'hal…
mcourteaux Oct 23, 2023
0711df6
WIP: I get segfaults. The device_interface pointer is bogus.
mcourteaux Nov 2, 2023
0423c96
Figured it out...
mcourteaux Nov 2, 2023
75731f2
Allow global sync on d3d12.
mcourteaux Nov 3, 2023
11a9778
Cleanly time all buffer copies as well.
mcourteaux Nov 3, 2023
e0698f5
Cleanup old comment.
mcourteaux Nov 3, 2023
5c22550
Following Andrews suggestion for suffixing buffer copies in the profi…
mcourteaux Nov 13, 2023
86638a3
Sort the profiler report lines into three sections: funcs, buffer cop…
mcourteaux Nov 17, 2023
28fedfd
Merge branch 'main' into pr/7913
steven-johnson Nov 28, 2023
2bcdf7c
Merge branch 'main' into pr/7913
steven-johnson Nov 28, 2023
d7c9dee
Merge branch 'main' into pr/7913
steven-johnson Nov 29, 2023
d65554b
Inject profiling for function calls to 'halide_copy_to_host' and 'hal…
mcourteaux Oct 23, 2023
39241df
WIP: I get segfaults. The device_interface pointer is bogus.
mcourteaux Nov 2, 2023
d8e25ec
Figured it out...
mcourteaux Nov 2, 2023
249c10d
Allow global sync on d3d12.
mcourteaux Nov 3, 2023
8c22a2e
Cleanly time all buffer copies as well.
mcourteaux Nov 3, 2023
16819e6
Cleanup old comment.
mcourteaux Nov 3, 2023
2121c35
Following Andrews suggestion for suffixing buffer copies in the profi…
mcourteaux Nov 13, 2023
511c06a
Sort the profiler report lines into three sections: funcs, buffer cop…
mcourteaux Nov 17, 2023
7f6478f
Attempt to fix output parsing.
mcourteaux Nov 30, 2023
902de94
Merge branch 'profile-device-copies' of https://github.com/mcourteaux…
steven-johnson Nov 30, 2023
6f817d3
Fix crash for copy_to_device
steven-johnson Nov 30, 2023
ddb8bbf
halide_device_sync_global(NULL) -> success
steven-johnson Nov 30, 2023
aaa448c
Fixed the buffer copy bug. Added a new test that will cause buffer co…
mcourteaux Dec 1, 2023
37739b4
Clang-format, my dear friend...
mcourteaux Dec 1, 2023
4098f49
Merge branch 'main' into pr/7913
steven-johnson Dec 8, 2023
5c9fd01
Merge branch 'profile-device-copies' of https://github.com/mcourteaux…
steven-johnson Dec 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/CodeGen_Internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ bool function_takes_user_context(const std::string &name) {
"halide_device_malloc",
"halide_device_and_host_malloc",
"halide_device_sync",
"halide_device_sync_global",
"halide_do_par_for",
"halide_do_loop_task",
"halide_do_task",
Expand Down
8 changes: 7 additions & 1 deletion src/OffloadGPULoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,13 @@ class InjectGpuOffload : public IRMutator {
Call::make(Handle(), Call::make_struct, args, Call::Intrinsic),
Call::make(Handle(), Call::make_struct, arg_is_buffer, Call::Intrinsic),
};
return call_extern_and_assert("halide_" + api_unique_name + "_run", run_args);
Stmt run_and_assert = call_extern_and_assert("halide_" + api_unique_name + "_run", run_args);
if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) {
Expr device_interface = make_device_interface_call(loop->device_api, MemoryType::Auto);
Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface});
return Block::make(run_and_assert, sync_and_assert);
}
return run_and_assert;
}

public:
Expand Down
72 changes: 72 additions & 0 deletions src/Profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ExprUsesVar.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "InjectHostDevBufferCopies.h"
#include "Profiling.h"
#include "Scope.h"
#include "Simplify.h"
Expand Down Expand Up @@ -422,6 +423,77 @@ class InjectProfiling : public IRMutator {
}
return IfThenElse::make(std::move(condition), std::move(then_case), std::move(else_case));
}

Stmt visit(const LetStmt *op) override {
if (const Call *call = op->value.as<Call>()) {
Stmt start_profiler;
if (call->name == "halide_copy_to_host" || call->name == "halide_copy_to_device") {
std::string buffer_name;
if (const Variable *var = call->args.front().as<Variable>()) {
buffer_name = var->name;
if (ends_with(buffer_name, ".buffer")) {
buffer_name = buffer_name.substr(0, buffer_name.size() - 7);
} else {
internal_error << "Expected to find a variable ending in .buffer as first argument to function call " << call->name << "\n";
}
} else {
internal_error << "Expected to find a variable as first argument of the function call " << call->name << ".\n";
}
bool requires_sync = false;
if (call->name == "halide_copy_to_host") {
int copy_to_host_id = get_func_id(buffer_name + " (copy to host)");
start_profiler = set_current_func(copy_to_host_id);
requires_sync = false;
} else if (call->name == "halide_copy_to_device") {
int copy_to_device_id = get_func_id(buffer_name + " (copy to device)");
start_profiler = set_current_func(copy_to_device_id);
requires_sync = true;
} else {
internal_error << "Unexpected function name.\n";
}
if (start_profiler.defined()) {
// The copy functions are followed by an assert, which we will wrap in the timed body.
const AssertStmt *copy_assert = nullptr;
Stmt other;
if (const Block *block = op->body.as<Block>()) {
if (const AssertStmt *assert = block->first.as<AssertStmt>()) {
copy_assert = assert;
other = block->rest;
}
} else if (const AssertStmt *assert = op->body.as<AssertStmt>()) {
copy_assert = assert;
}
if (copy_assert) {
std::vector<Stmt> steps;
steps.push_back(AssertStmt::make(copy_assert->condition, copy_assert->message));
if (requires_sync) {
internal_assert(call->name == "halide_copy_to_device");
Expr device_interface = call->args.back(); // The last argument to the copy_to_device calls is the device_interface.
Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface});
steps.push_back(sync_and_assert);
}
steps.push_back(set_current_func(stack.back()));

if (other.defined()) {
steps.push_back(mutate(other));
}
return Block::make(start_profiler,
LetStmt::make(op->name, mutate(op->value),
Block::make(steps)));
} else {
internal_error << "No assert found after buffer copy.\n";
}
}
}
}

Stmt body = mutate(op->body);
Expr value = mutate(op->value);
if (body.same_as(op->body) && value.same_as(op->value)) {
return op;
}
return LetStmt::make(op->name, value, body);
}
};

} // namespace
Expand Down
9 changes: 9 additions & 0 deletions src/runtime/HalideRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,15 @@ extern int halide_device_release_crop(void *user_context,
* should rarely be necessary, except maybe for profiling. */
extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf);

/**
* Wait for current GPU operations to complete. Calling this explicitly
* should rarely be necessary, except maybe for profiling.
* This variation of the synchronizing is useful when a synchronization is desirable
* without specifying any buffer to synchronize on.
* Calling this with a null device_interface is always illegal.
*/
extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface);

/** Allocate device memory to back a halide_buffer_t. */
extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
const struct halide_device_interface_t *device_interface);
Expand Down
8 changes: 6 additions & 2 deletions src/runtime/d3d12compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2786,8 +2786,12 @@ WEAK int halide_d3d12compute_device_sync(void *user_context, struct halide_buffe
return d3d12_context.error();
}

d3d12_buffer *dbuffer = peel_buffer(buffer);
d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
if (buffer != nullptr) {
d3d12_buffer *dbuffer = peel_buffer(buffer);
d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
} else {
d3d12compute_device_sync_internal(d3d12_context.device, nullptr);
}

return halide_error_code_success;
}
Expand Down
15 changes: 15 additions & 0 deletions src/runtime/device_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,21 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) {
}
}

/**
* Wait for current GPU operations to complete. Calling this explicitly
* should rarely be necessary, except maybe for profiling.
* This variation of the synchronizing is useful when a synchronization is desirable
* without specifying any buffer to synchronize on.
*/
WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) {
if (device_interface == nullptr) {
return halide_error_code_no_device_interface;
}
// This function calls immediately the device_interface implementation to syncrhonize on
// "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync.
return device_interface->impl->device_sync(user_context, nullptr);
}

/** Allocate device memory to back a halide_buffer_t. */
WEAK int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
const halide_device_interface_t *device_interface) {
Expand Down
128 changes: 111 additions & 17 deletions src/runtime/profiler_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,14 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
};
}
}
bool support_colors = false;
const char *term = getenv("TERM");
if (term) {
// Check if the terminal supports colors
if (strstr(term, "color") || strstr(term, "xterm")) {
support_colors = true;
}
}

for (halide_profiler_pipeline_stats *p = s->pipelines; p;
p = (halide_profiler_pipeline_stats *)(p->next)) {
Expand Down Expand Up @@ -385,14 +393,31 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
if (print_f_states) {
int f_stats_count = 0;
halide_profiler_func_stats **f_stats = (halide_profiler_func_stats **)__builtin_alloca(p->num_funcs * sizeof(halide_profiler_func_stats *));
const char *substr_copy_to_device = " (copy to device)";
const char *substr_copy_to_host = " (copy to host)";

int max_func_name_length = 23; // length of the section header
int num_copy_to_device = 0;
int num_copy_to_host = 0;

int max_func_name_length = 0;
uint64_t total_func_time = 0;
uint64_t total_copy_to_device_time = 0;
uint64_t total_copy_to_host_time = 0;
for (int i = 0; i < p->num_funcs; i++) {
halide_profiler_func_stats *fs = p->funcs + i;
int name_len = strlen(fs->name);
if (name_len > max_func_name_length) {
max_func_name_length = name_len;
}
if (strstr(fs->name, substr_copy_to_device)) {
num_copy_to_device++;
total_copy_to_device_time += fs->time;
} else if (strstr(fs->name, substr_copy_to_host)) {
num_copy_to_host++;
total_copy_to_host_time += fs->time;
} else {
total_func_time += fs->time;
}
}

for (int i = 0; i < p->num_funcs; i++) {
Expand All @@ -418,18 +443,8 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
}
}

for (int i = 0; i < f_stats_count; i++) {
size_t cursor = 0;
sstr.clear();
halide_profiler_func_stats *fs = f_stats[i];

sstr << " " << fs->name << ": ";
cursor += max_func_name_length + 5;
while (sstr.size() < cursor) {
sstr << " ";
}

float ft = fs->time / (p->runs * 1000000.0f);
const auto print_time_and_percentage = [&sstr, p](uint64_t time, size_t &cursor, bool light) {
float ft = time / (p->runs * 1000000.0f);
if (ft < 10000) {
sstr << " ";
}
Expand All @@ -451,16 +466,40 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
sstr << " ";
}

int percent = 0;
int perthousand = 0;
if (p->time != 0) {
percent = (100 * fs->time) / p->time;
perthousand = (1000 * time) / p->time;
}
sstr << "(";
if (perthousand < 100) {
sstr << " ";
}
sstr << "(" << percent << "%)";
cursor += 8;
int percent = perthousand / 10;
sstr << percent << "." << (perthousand - percent * 10) << "%)";
if (!light) {
cursor += 10;
while (sstr.size() < cursor) {
sstr << " ";
}
}
};

auto print_report_entry = [&](halide_profiler_func_stats *fs, const char *suffix_cut) {
size_t cursor = 0;
sstr.clear();

sstr << " " << fs->name;
if (suffix_cut) {
sstr.erase(strlen(suffix_cut));
}
sstr << ": ";
cursor += max_func_name_length + 7;
while (sstr.size() < cursor) {
sstr << " ";
}

print_time_and_percentage(fs->time, cursor, false);

if (!serial) {
float threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10);
sstr << "threads: " << threads;
Expand Down Expand Up @@ -494,6 +533,61 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
sstr << "\n";

halide_print(user_context, sstr.str());
};

if (num_copy_to_host == 0 && num_copy_to_device == 0) {
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
print_report_entry(fs, nullptr);
}
} else {
const auto print_section_header = [&](const char *name, uint64_t total_time) {
size_t cursor = 0;
sstr.clear();
sstr << " ";
if (support_colors) {
sstr << "\033[90m\033[3m";
cursor += 9;
}
sstr << "[" << name << " ";
cursor += max_func_name_length + 7;
while (sstr.size() < cursor) {
sstr << ":";
}
print_time_and_percentage(total_time, cursor, true);
sstr << " ::::]";
if (support_colors) {
sstr << "\033[0m";
}
sstr << "\n";
halide_print(user_context, sstr.str());
};

print_section_header("funcs", total_func_time);
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
if (!strstr(fs->name, substr_copy_to_device) && !strstr(fs->name, substr_copy_to_host)) {
print_report_entry(fs, nullptr);
}
}
if (num_copy_to_device) {
print_section_header("buffer copies to device", total_copy_to_device_time);
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
if (strstr(fs->name, substr_copy_to_device)) {
print_report_entry(fs, substr_copy_to_device);
}
}
}
if (num_copy_to_host) {
print_section_header("buffer copies to host", total_copy_to_host_time);
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
if (strstr(fs->name, substr_copy_to_host)) {
print_report_entry(fs, substr_copy_to_host);
}
}
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/runtime/runtime_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
(void *)&halide_device_malloc,
(void *)&halide_device_release,
(void *)&halide_device_sync,
(void *)&halide_device_sync_global,
(void *)&halide_disable_timer_interrupt,
(void *)&halide_do_par_for,
(void *)&halide_do_parallel_tasks,
Expand Down
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ tests(GROUPS correctness
debug_to_file_reorder.cpp
deferred_loop_level.cpp
deinterleave4.cpp
device_buffer_copies_with_profile.cpp
device_buffer_copy.cpp
device_copy_at_inner_loop.cpp
device_crop.cpp
Expand Down
Loading