Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@ All notable changes to this project will be documented in this file.

- `operator-rs` `0.56.1` -> `0.57.0` ([#433]).

### Fixed

- Include hdfs principals `dfs.journalnode.kerberos.principal`, `dfs.namenode.kerberos.principal`
and `dfs.datanode.kerberos.principal` in the discovery ConfigMap in case Kerberos is enabled ([#451]).

[#433]: https://github.com/stackabletech/hdfs-operator/pull/433
[#451]: https://github.com/stackabletech/hdfs-operator/pull/451

## [23.11.0] - 2023-11-24

Expand Down
11 changes: 10 additions & 1 deletion docs/modules/hdfs/pages/reference/discovery.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,13 @@ The ConfigMap data values are formatted as Hadoop XML files which allows simple
Contains the `fs.defaultFS` property, which defaults to `hdfs://{clusterName}/`.

`hdfs-site.xml`::
Contains the `dfs.namenode.*` properties for `rpc` and `http` addresses for the `namenodes` as well as the `dfs.nameservices` property which defaults to `hdfs://{clusterName}/`.
Contains the `dfs.namenode.*` properties for `rpc` and `http` addresses for the `namenodes` as well as the `dfs.nameservices` property which defaults to `hdfs://{clusterName}/`.

=== Kerberos
In case Kerberos is enabled according to the xref:usage-guide/security.adoc[security documentation], the discovery ConfigMap also includes the information that clients must authenticate themselves using Kerberos.

Some Kerberos-related configuration settings require the environment variable `KERBEROS_REALM` to be set (e.g. using `export KERBEROS_REALM=$(grep -oP 'default_realm = \K.*' /stackable/kerberos/krb5.conf)`).
If you want to use the discovery ConfigMap outside Stackable services, you need to provide this environment variable.
As an alternative you can substitute `${env.KERBEROS_REALM}` with your actual realm (e.g. by using `sed -i -e 's/\${env\.KERBEROS_REALM}/'"$KERBEROS_REALM"'/g' core-site.xml`).

One example would be the property `dfs.namenode.kerberos.principal` being set to `nn/hdfs.default.svc.cluster.local@${env.KERBEROS_REALM}`.
5 changes: 5 additions & 0 deletions rust/crd/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,11 @@ impl HdfsRole {
}

impl HdfsCluster {
/// Return the namespace of the cluster or an error in case it is not set.
///
/// # Errors
///
/// Fails with a `NoNamespace` error (via the `NoNamespaceSnafu` context
/// selector) if the object metadata carries no namespace.
pub fn namespace_or_error(&self) -> Result<String, Error> {
    self.namespace().context(NoNamespaceSnafu)
}

/// Kubernetes labels to attach to Pods within a role group.
///
/// The same labels are also used as selectors for Services and StatefulSets.
Expand Down
26 changes: 17 additions & 9 deletions rust/operator-binary/src/discovery.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::{
build_recommended_labels,
config::{CoreSiteConfigBuilder, HdfsSiteConfigBuilder},
hdfs_controller::Error,
};
use stackable_hdfs_crd::{
constants::{CORE_SITE_XML, HDFS_SITE_XML},
Expand All @@ -9,9 +10,8 @@ use stackable_hdfs_crd::{
use stackable_operator::{
builder::{ConfigMapBuilder, ObjectMetaBuilder},
commons::product_image_selection::ResolvedProductImage,
error::OperatorResult,
k8s_openapi::api::core::v1::ConfigMap,
kube::ResourceExt,
kube::{runtime::reflector::ObjectRef, ResourceExt},
};

/// Creates a discovery config map containing the `hdfs-site.xml` and `core-site.xml`
Expand All @@ -21,12 +21,16 @@ pub fn build_discovery_configmap(
controller: &str,
namenode_podrefs: &[HdfsPodRef],
resolved_product_image: &ResolvedProductImage,
) -> OperatorResult<ConfigMap> {
) -> Result<ConfigMap, crate::hdfs_controller::Error> {
ConfigMapBuilder::new()
.metadata(
ObjectMetaBuilder::new()
.name_and_namespace(hdfs)
.ownerreference_from_resource(hdfs, None, Some(true))?
.ownerreference_from_resource(hdfs, None, Some(true))
.map_err(|err| Error::ObjectMissingMetadataForOwnerRef {
source: err,
obj_ref: ObjectRef::from_obj(hdfs),
})?
.with_recommended_labels(build_recommended_labels(
hdfs,
controller,
Expand All @@ -42,9 +46,10 @@ pub fn build_discovery_configmap(
)
.add_data(
CORE_SITE_XML,
build_discovery_core_site_xml(hdfs, hdfs.name_any()),
build_discovery_core_site_xml(hdfs, hdfs.name_any())?,
)
.build()
.map_err(|err| Error::BuildDiscoveryConfigMap { source: err })
}

fn build_discovery_hdfs_site_xml(
Expand All @@ -62,9 +67,12 @@ fn build_discovery_hdfs_site_xml(
.build_as_xml()
}

fn build_discovery_core_site_xml(hdfs: &HdfsCluster, logical_name: String) -> String {
CoreSiteConfigBuilder::new(logical_name)
fn build_discovery_core_site_xml(
hdfs: &HdfsCluster,
logical_name: String,
) -> Result<String, Error> {
Ok(CoreSiteConfigBuilder::new(logical_name)
.fs_default_fs()
.security_discovery_config(hdfs)
.build_as_xml()
.security_discovery_config(hdfs)?
.build_as_xml())
}
10 changes: 2 additions & 8 deletions rust/operator-binary/src/hdfs_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,7 @@ pub async fn reconcile_hdfs(hdfs: Arc<HdfsCluster>, ctx: Arc<Ctx>) -> HdfsOperat
HDFS_CONTROLLER,
&namenode_podrefs,
&resolved_product_image,
)
.context(BuildDiscoveryConfigMapSnafu)?;
)?;

// The discovery CM is linked to the cluster lifecycle via ownerreference.
// Therefore, must not be added to the "orphaned" cluster resources
Expand Down Expand Up @@ -482,11 +481,6 @@ fn rolegroup_config_map(
.with_context(|| ObjectHasNoNameSnafu {
obj_ref: ObjectRef::from_obj(hdfs),
})?;
let hdfs_namespace = hdfs
.namespace()
.with_context(|| ObjectHasNoNamespaceSnafu {
obj_ref: ObjectRef::from_obj(hdfs),
})?;

let mut hdfs_site_xml = String::new();
let mut core_site_xml = String::new();
Expand Down Expand Up @@ -525,7 +519,7 @@ fn rolegroup_config_map(
core_site_xml = CoreSiteConfigBuilder::new(hdfs_name.to_string())
.fs_default_fs()
.ha_zookeeper_quorum()
.security_config(hdfs, hdfs_name, &hdfs_namespace)
.security_config(hdfs)?
// the extend with config must come last in order to have overrides working!!!
.extend(config)
.build_as_xml();
Expand Down
77 changes: 52 additions & 25 deletions rust/operator-binary/src/kerberos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ use stackable_hdfs_crd::{
constants::{SSL_CLIENT_XML, SSL_SERVER_XML},
HdfsCluster,
};
use stackable_operator::commons::product_image_selection::ResolvedProductImage;
use stackable_operator::{
commons::product_image_selection::ResolvedProductImage,
kube::{runtime::reflector::ObjectRef, ResourceExt},
};

use crate::{
config::{CoreSiteConfigBuilder, HdfsSiteConfigBuilder},
Expand Down Expand Up @@ -52,29 +55,14 @@ impl HdfsSiteConfigBuilder {
}

impl CoreSiteConfigBuilder {
pub fn security_config(
&mut self,
hdfs: &HdfsCluster,
hdfs_name: &str,
hdfs_namespace: &str,
) -> &mut Self {
pub fn security_config(&mut self, hdfs: &HdfsCluster) -> Result<&mut Self, Error> {
if hdfs.authentication_config().is_some() {
// For a long time we tried using `_HOST` in principals, e.g. `jn/_HOST@REALM.COM`.
// Turns out there are a lot of code paths that check the principal of the requester using a reverse lookup of the incoming IP address
// and getting a different hostname than the principal has.
// What ultimately killed this approach was
//
// 2023-05-30 09:23:01,745 ERROR namenode.EditLogInputStream (EditLogFileInputStream.java:nextOpImpl(220)) - caught exception initializing https://hdfs-journalnode-default-1.hdfs-journalnode-default.kuttl-test-fine-rat.svc.cluster.local:8481/getJournal?jid=hdfs&segmentTxId=1&storageInfo=-65%3A595659877%3A1685437352616%3ACID-90c52400-5b07-49bf-bdbe-3469bbdc5ebb&inProgressOk=true
// org.apache.hadoop.hdfs.server.common.HttpGetFailedException: Fetch of https://hdfs-journalnode-default-1.hdfs-journalnode-default.kuttl-test-fine-rat.svc.cluster.local:8481/getJournal?jid=hdfs&segmentTxId=1&storageInfo=-65%3A595659877%3A1685437352616%3ACID-90c52400-5b07-49bf-bdbe-3469bbdc5ebb&inProgressOk=true failed with status code 403
// Response message:
// Only Namenode and another JournalNode may access this servlet
//
// After we have switched to using the following principals everything worked without problems
let principal_host_part = principal_host_part(hdfs)?;

let principal_host_part =
format!("{hdfs_name}.{hdfs_namespace}.svc.cluster.local@${{env.KERBEROS_REALM}}");
self.add("hadoop.security.authentication", "kerberos")
.add("hadoop.registry.kerberos.realm", "${env.KERBEROS_REALM}")
// Not adding hadoop.registry.kerberos.realm, as it seems to not be used by our customers
// and would need text-replacement of the env var anyway.
// .add("hadoop.registry.kerberos.realm", "${env.KERBEROS_REALM}")
.add(
"dfs.journalnode.kerberos.principal",
format!("jn/{principal_host_part}"),
Expand Down Expand Up @@ -115,19 +103,58 @@ impl CoreSiteConfigBuilder {

self.add_wire_encryption_settings();
}
self
Ok(self)
}

pub fn security_discovery_config(&mut self, hdfs: &HdfsCluster) -> &mut Self {
pub fn security_discovery_config(&mut self, hdfs: &HdfsCluster) -> Result<&mut Self, Error> {
if hdfs.has_kerberos_enabled() {
self.add("hadoop.security.authentication", "kerberos");
let principal_host_part = principal_host_part(hdfs)?;

self.add("hadoop.security.authentication", "kerberos")
.add(
"dfs.journalnode.kerberos.principal",
format!("jn/{principal_host_part}"),
)
.add(
"dfs.namenode.kerberos.principal",
format!("nn/{principal_host_part}"),
)
.add(
"dfs.datanode.kerberos.principal",
format!("dn/{principal_host_part}"),
);
self.add_wire_encryption_settings();
}
self
Ok(self)
}

/// Add the wire-encryption setting: force `hadoop.rpc.protection` to
/// `privacy` so RPC traffic is protected.
fn add_wire_encryption_settings(&mut self) -> &mut Self {
    // `add` already returns `&mut Self`, so the call itself is the
    // tail expression — no separate `self` return needed.
    self.add("hadoop.rpc.protection", "privacy")
}
}

/// For a long time we tried using `_HOST` in principals, e.g. `jn/_HOST@REALM.COM`.
/// Turns out there are a lot of code paths that check the principal of the requester using a reverse lookup of the incoming IP address
/// and getting a different hostname than the principal has.
/// What ultimately killed this approach was
///
/// ```text
/// 2023-05-30 09:23:01,745 ERROR namenode.EditLogInputStream (EditLogFileInputStream.java:nextOpImpl(220)) - caught exception initializing https://hdfs-journalnode-default-1.hdfs-journalnode-default.kuttl-test-fine-rat.svc.cluster.local:8481/getJournal?jid=hdfs&segmentTxId=1&storageInfo=-65%3A595659877%3A1685437352616%3ACID-90c52400-5b07-49bf-bdbe-3469bbdc5ebb&inProgressOk=true
/// org.apache.hadoop.hdfs.server.common.HttpGetFailedException: Fetch of https://hdfs-journalnode-default-1.hdfs-journalnode-default.kuttl-test-fine-rat.svc.cluster.local:8481/getJournal?jid=hdfs&segmentTxId=1&storageInfo=-65%3A595659877%3A1685437352616%3ACID-90c52400-5b07-49bf-bdbe-3469bbdc5ebb&inProgressOk=true failed with status code 403
/// Response message:
/// Only Namenode and another JournalNode may access this servlet
/// ```
///
/// After we have switched to using the following principals everything worked without problems
fn principal_host_part(hdfs: &HdfsCluster) -> Result<String, Error> {
let hdfs_name = hdfs.name_any();
let hdfs_namespace = hdfs
.namespace_or_error()
.map_err(|_| Error::ObjectHasNoNamespace {
obj_ref: ObjectRef::from_obj(hdfs),
})?;
Ok(format!(
"{hdfs_name}.{hdfs_namespace}.svc.cluster.local@${{env.KERBEROS_REALM}}"
))
}