apache · alamb · Nov 9, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/datafusion/physical-plan/src/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs
@@ -40,59 +40,67 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
 use hashbrown::raw::RawTable;
 use hashbrown::HashSet;
 
-// Maps a `u64` hash value based on the build side ["on" values] to a list of indices with this key's value.
-// By allocating a `HashMap` with capacity for *at least* the number of rows for entries at the build side,
-// we make sure that we don't have to re-hash the hashmap, which needs access to the key (the hash in this case) value.
-// E.g. 1 -> [3, 6, 8] indicates that the column values map to rows 3, 6 and 8 for hash value 1
-// As the key is a hash value, we need to check possible hash collisions in the probe stage
-// During this stage it might be the case that a row is contained the same hashmap value,
-// but the values don't match. Those are checked in the [equal_rows] macro
-// The indices (values) are stored in a separate chained list stored in the `Vec<u64>`.
-// The first value (+1) is stored in the hashmap, whereas the next value is stored in array at the position value.
-// The chain can be followed until the value "0" has been reached, meaning the end of the list.
-// Also see chapter 5.3 of [Balancing vectorized query execution with bandwidth-optimized storage](https://dare.uva.nl/search?identifier=5ccbb60a-38b8-4eeb-858a-e7735dd37487)
-// See the example below:
-// Insert (1,1)
-// map:
-// ---------
-// | 1 | 2 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 0 | 0 |
-// ---------------------
-// Insert (2,2)
-// map:
-// ---------
-// | 1 | 2 |
-// | 2 | 3 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 0 | 0 |
-// ---------------------
-// Insert (1,3)
-// map:
-// ---------
-// | 1 | 4 |
-// | 2 | 3 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 2 | 0 |  <--- hash value 1 maps to 4,2 (which means indices values 3,1)
-// ---------------------
-// Insert (1,4)
-// map:
-// ---------
-// | 1 | 5 |
-// | 2 | 3 |
-// ---------
-// next:
-// ---------------------
-// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1)
-// ---------------------
-// TODO: speed up collision checks
-// https://github.com/apache/arrow-datafusion/issues/50
+/// Maps a `u64` hash value based on the build side ["on" values] to a list of indices with this key's value.
+///
+/// By allocating a `HashMap` with capacity for *at least* the number of rows for entries at the build side,
+/// we make sure that we don't have to re-hash the hashmap, which needs access to the key (the hash in this case) value.
+///
+/// E.g. `1 -> [3, 6, 8]` indicates that the column values map to rows 3, 6 and 8 for hash value 1.
+/// As the key is a hash value, we need to check for possible hash collisions in the probe stage.
+/// During this stage it might be the case that a row is contained in the same hashmap value,
+/// but the values don't match. Those are checked in the [`equal_rows_arr`](crate::joins::hash_join::equal_rows_arr) method.
+///
+/// The indices (values) are stored in a separate chained list stored in the `Vec<u64>`.
+///
+/// The first value (+1) is stored in the hashmap, whereas the next value is stored in the array at the position value.
+///
+/// The chain can be followed until the value "0" has been reached, meaning the end of the list.
+/// Also see chapter 5.3 of [Balancing vectorized query execution with bandwidth-optimized storage](https://dare.uva.nl/search?identifier=5ccbb60a-38b8-4eeb-858a-e7735dd37487)
+///
+/// # Example
+///
+/// ``` text
+/// See the example below:
+/// Insert (1,1)
+/// map:
+/// ---------
+/// | 1 | 2 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 0 | 0 |
+/// ---------------------
+/// Insert (2,2)
+/// map:
+/// ---------
+/// | 1 | 2 |
+/// | 2 | 3 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 0 | 0 |
+/// ---------------------
+/// Insert (1,3)
+/// map:
+/// ---------
+/// | 1 | 4 |
+/// | 2 | 3 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 2 | 0 |  <--- hash value 1 maps to 4,2 (which means index values 3,1)
+/// ---------------------
+/// Insert (1,4)
+/// map:
+/// ---------
+/// | 1 | 5 |
+/// | 2 | 3 |
+/// ---------
+/// next:
+/// ---------------------
+/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means index values 4,3,1)
+/// ---------------------
+/// ```
 pub struct JoinHashMap {
     // Stores hash value to last row index
     pub map: RawTable<(u64, u64)>,