diff --git a/aks-node-controller/pkg/nodeconfigutils/utils.go b/aks-node-controller/pkg/nodeconfigutils/utils.go index 678a83a8c56..11a3ff9cc98 100644 --- a/aks-node-controller/pkg/nodeconfigutils/utils.go +++ b/aks-node-controller/pkg/nodeconfigutils/utils.go @@ -20,7 +20,10 @@ set -euo pipefail logger -t aks-boothook "boothook start $(date -Ins)" -mkdir -p /opt/azure/containers +mkdir -p /opt/azure/containers /var/lib/waagent + +touch /var/lib/waagent/experimental_skip_ready_report +chmod 0644 /var/lib/waagent/experimental_skip_ready_report cat <<'EOF' | base64 -d >/opt/azure/containers/aks-node-controller-config.json %s diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 467134c4b8e..b40ccc840ff 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -506,8 +506,9 @@ func Test_Ubuntu2204_Scriptless(t *testing.T) { RunScenario(t, &Scenario{ Description: "tests that a new ubuntu 2204 node using self contained installer can be properly bootstrapped", Config: Config{ - Cluster: ClusterKubenet, - VHD: config.VHDUbuntu2204Gen2Containerd, + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + UseCustomDataOnlyProvisioning: true, Validator: func(ctx context.Context, s *Scenario) { ValidateFileHasContent(ctx, s, "/var/log/azure/aks-node-controller.log", "aks-node-controller finished successfully") }, diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 6b05a5d5924..ee280781b7e 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -301,8 +301,10 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { require.NoError(s.T, err, "create vmss %q, check %s for vm logs", s.Runtime.VMSSName, testDir(s.T)) } - err = getCustomScriptExtensionStatus(s, scenarioVM.VM) - require.NoError(s.T, err) + if !s.Config.UseCustomDataOnlyProvisioning { + err = getCustomScriptExtensionStatus(s, scenarioVM.VM) + require.NoError(s.T, err) + } if !s.Config.SkipDefaultValidation { vmssCreatedAt := time.Now() // Record the start time diff --git a/e2e/types.go b/e2e/types.go index 795e3722d09..dbc411f0a12 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -163,6 +163,11 @@ type Config struct { // AKSNodeConfigMutator if defined then aks-node-controller will be used to provision nodes AKSNodeConfigMutator func(*aksnodeconfigv1.Configuration) + // UseCustomDataOnlyProvisioning switches an AKSNodeConfig scenario to a CustomData-only E2E flow. + // It omits the VMSS Custom Script Extension and uses CustomData to run aks-node-controller provision + // directly during cloud-init instead. + UseCustomDataOnlyProvisioning bool + // VMConfigMutator is a function which mutates the base VMSS model according to the scenario's requirements VMConfigMutator func(*armcompute.VirtualMachineScaleSet) diff --git a/e2e/vmss.go b/e2e/vmss.go index 3fbc00c1c86..e736f99a116 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -95,7 +95,10 @@ func CustomDataWithHack(s *Scenario, binaryURL string) (string, error) { #!/bin/bash set -euo pipefail -mkdir -p /opt/azure/containers /opt/azure/bin +mkdir -p /opt/azure/containers /opt/azure/bin /var/lib/waagent + +touch /var/lib/waagent/experimental_skip_ready_report +chmod 0644 /var/lib/waagent/experimental_skip_ready_report cat <<'EOF' | base64 -d > /opt/azure/containers/aks-node-controller-config-hack.json %s diff --git a/parts/linux/cloud-init/artifacts/cse_install.sh b/parts/linux/cloud-init/artifacts/cse_install.sh index 3f7871292a9..92f9b4074ef 100755 --- a/parts/linux/cloud-init/artifacts/cse_install.sh +++ b/parts/linux/cloud-init/artifacts/cse_install.sh @@ -872,4 +872,13 @@ EOF systemctl restart nvidia-persistenced.service || exit 1 } +skipCloudInitReadyReport() { + local config_filepath="/etc/cloud/cloud.cfg.d/81_azure_skip_ready_report.cfg" + mkdir -p "$(dirname "${config_filepath}")" + cat <"${config_filepath}" +datasource: + Azure: + experimental_skip_ready_report: true +EOF +} #EOF diff --git a/parts/linux/cloud-init/artifacts/cse_start.sh b/parts/linux/cloud-init/artifacts/cse_start.sh index bc1fce5d7e3..cc94b7983f8 100644 --- a/parts/linux/cloud-init/artifacts/cse_start.sh +++ b/parts/linux/cloud-init/artifacts/cse_start.sh @@ -103,6 +103,15 @@ EVENT_JSON=$( jq -n \ ) echo ${EVENT_JSON} > ${EVENTS_LOGGING_DIR}${EVENTS_FILE_NAME}.json + +if [ -x /opt/azure/containers/report_ready.py ]; then + if [ "$EXIT_CODE" -eq 0 ]; then + python3 /opt/azure/containers/report_ready.py -v || echo "WARNING: Failed to report ready to Azure fabric" + else + python3 /opt/azure/containers/report_ready.py -v --failure --description "ExitCode: ${EXIT_CODE}, ${message_string}" || echo "WARNING: Failed to report failure to Azure fabric" + fi +fi + # force a log upload to the host after the provisioning script finishes # if we failed, wait for the upload to complete so that we don't remove # the VM before it finishes. if we succeeded, upload in the background diff --git a/parts/linux/cloud-init/artifacts/report_ready.py b/parts/linux/cloud-init/artifacts/report_ready.py new file mode 100644 index 00000000000..577cdc2d0f6 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/report_ready.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 +"""Standalone script to report provisioning status to Azure fabric. + +When cloud-init's built-in ready report is skipped via +experimental_skip_ready_report, this script can be invoked to send the +health status signal to Azure wireserver at the appropriate time during +node provisioning. + +Protocol overview: + 1. Write provisioning status to Hyper-V KVP pool file + 2. GET http:///machine/?comp=goalstate -> GoalState XML + 3. Parse container_id, instance_id, incarnation from the XML + 4. POST http:///machine?comp=health <- Health report XML + +Usage: + report_ready.py [--endpoint ENDPOINT] [--retries N] [--retry-delay SECS] + report_ready.py --failure --description "CSE failed with exit code 42" + +The wireserver endpoint defaults to 168.63.129.16. +""" + +import argparse +import csv +import fcntl +import io +import logging +import os +import struct +import subprocess +import sys +import time +import xml.etree.ElementTree as ET +from datetime import datetime, timezone +from html import escape +from http.client import HTTPConnection, HTTPException +from textwrap import dedent + +LOG = logging.getLogger("report_ready") + +DEFAULT_WIRESERVER_ENDPOINT = "168.63.129.16" +DEFAULT_RETRIES = 3 +DEFAULT_RETRY_DELAY = 5 +DESCRIPTION_TRIM_LEN = 512 + +WIRESERVER_HEADERS = { + "x-ms-agent-name": "WALinuxAgent", + "x-ms-version": "2012-11-30", +} + +HEALTH_REPORT_XML = dedent( + """\ + + + {incarnation} + + {container_id} + + + {instance_id} + + {status} + {detail_subsection} + + + + + + """ +) + +HEALTH_DETAIL_XML = dedent( + """\ +
+ {substatus} + {description} +
""" +) + +# Hyper-V KVP constants matching cloud-init's protocol +KVP_POOL_FILE_GUEST = "/var/lib/hyperv/.kvp_pool_1" +KVP_KEY_SIZE = 512 +KVP_VALUE_SIZE = 2048 +KVP_RECORD_SIZE = KVP_KEY_SIZE + KVP_VALUE_SIZE +KVP_AZURE_MAX_VALUE_SIZE = 1024 +KVP_PROVISIONING_KEY = "PROVISIONING_REPORT" +AGENT_NAME = "AKS-CSE" + +# When this file exists, cloud-init is handling the ready report itself, +# so this script should not send a duplicate report to the wireserver. +REPORT_MARKER = "/var/lib/waagent/experimental_skip_ready_report" + + +# --------------------------------------------------------------------------- +# Hyper-V KVP helpers +# --------------------------------------------------------------------------- + + +def _encode_kvp_record(key: str, value: str) -> bytes: + """Encode a key-value pair into a fixed-size KVP binary record.""" + return struct.pack( + "%ds%ds" % (KVP_KEY_SIZE, KVP_VALUE_SIZE), + key.encode("utf-8"), + value.encode("utf-8"), + ) + + +def _append_kvp_record(record: bytes, kvp_file: str = KVP_POOL_FILE_GUEST) -> None: + """Append a binary KVP record to the pool file with file locking.""" + try: + with open(kvp_file, "ab") as f: + fcntl.flock(f, fcntl.LOCK_EX) + f.write(record) + f.flush() + fcntl.flock(f, fcntl.LOCK_UN) + except OSError as e: + LOG.warning("Failed to write KVP record: %s", e) + + +def _encode_report(fields: list) -> str: + """Encode report fields as pipe-delimited CSV, matching cloud-init format.""" + buf = io.StringIO() + csv.writer(buf, delimiter="|", quotechar="'", quoting=csv.QUOTE_MINIMAL).writerow(fields) + return buf.getvalue().rstrip() + + +def _get_vm_id() -> str: + """Read the VM ID from DMI system-uuid.""" + try: + with open("/sys/class/dmi/id/product_uuid", "r") as f: + return f.read().strip().lower() + except OSError: + pass + try: + result = subprocess.run( + ["dmidecode", "-s", "system-uuid"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + return result.stdout.strip().lower() + except Exception: + pass + return "00000000-0000-0000-0000-000000000000" + + +def write_provisioning_kvp(report: str) -> None: + """Write a provisioning report to the KVP pool file.""" + if len(report) >= KVP_AZURE_MAX_VALUE_SIZE: + report = report[:KVP_AZURE_MAX_VALUE_SIZE - 1] + record = _encode_kvp_record(KVP_PROVISIONING_KEY, report) + _append_kvp_record(record) + LOG.info("Wrote PROVISIONING_REPORT to KVP.") + + +def report_dmesg_to_kvp() -> None: + """Capture dmesg output and write it to KVP, matching cloud-init behavior.""" + try: + result = subprocess.run( + ["dmesg"], capture_output=True, timeout=10, + ) + dmesg = result.stdout[-KVP_AZURE_MAX_VALUE_SIZE:] if result.stdout else b"" + record = _encode_kvp_record("dmesg", dmesg.decode("utf-8", errors="replace")) + _append_kvp_record(record) + LOG.debug("Wrote dmesg to KVP.") + except Exception as e: + LOG.warning("Failed to dump dmesg to KVP: %s", e) + + +def kvp_report_success(vm_id: str) -> None: + """Write success provisioning report to KVP, matching cloud-init format.""" + report = _encode_report([ + "result=success", + f"agent={AGENT_NAME}", + f"timestamp={datetime.now(timezone.utc).isoformat()}", + f"vm_id={vm_id}", + ]) + write_provisioning_kvp(report) + + +def kvp_report_failure(description: str, vm_id: str) -> None: + """Write failure provisioning report to KVP, matching cloud-init format.""" + report = _encode_report([ + "result=failure", + f"agent={AGENT_NAME}", + f"timestamp={datetime.now(timezone.utc).isoformat()}", + f"vm_id={vm_id}", + f"description={description[:DESCRIPTION_TRIM_LEN]}", + ]) + write_provisioning_kvp(report) + + +# --------------------------------------------------------------------------- +# Wireserver HTTP helpers +# --------------------------------------------------------------------------- + + +def http_get(endpoint: str, path: str) -> bytes: + """Perform an HTTP GET against the wireserver.""" + conn = HTTPConnection(endpoint, timeout=30) + try: + conn.request("GET", path, headers=WIRESERVER_HEADERS) + resp = conn.getresponse() + if resp.status != 200: + raise RuntimeError( + f"GET {path} returned HTTP {resp.status}: {resp.reason}" + ) + return resp.read() + finally: + conn.close() + + +def http_post(endpoint: str, path: str, body: bytes) -> None: + """Perform an HTTP POST against the wireserver.""" + headers = { + **WIRESERVER_HEADERS, + "Content-Type": "text/xml; charset=utf-8", + } + conn = HTTPConnection(endpoint, timeout=30) + try: + conn.request("POST", path, body=body, headers=headers) + resp = conn.getresponse() + if resp.status not in (200, 201, 202): + raise RuntimeError( + f"POST {path} returned HTTP {resp.status}: {resp.reason}" + ) + finally: + conn.close() + + +def _xml_text(root: ET.Element, xpath: str) -> str: + """Extract text from an XML element, raising if missing.""" + el = root.find(xpath) + if el is None or el.text is None: + raise ValueError(f"Missing element at xpath: {xpath}") + return el.text + + +def fetch_goalstate(endpoint: str) -> dict: + """Fetch and parse the GoalState from wireserver. + + Returns dict with keys: container_id, instance_id, incarnation. + """ + raw = http_get(endpoint, "/machine/?comp=goalstate") + root = ET.fromstring(raw) # nosec B314 + + return { + "container_id": _xml_text(root, "./Container/ContainerId"), + "instance_id": _xml_text( + root, + "./Container/RoleInstanceList/RoleInstance/InstanceId", + ), + "incarnation": _xml_text(root, "./Incarnation"), + } + + +def build_health_report( + goalstate: dict, + status: str = "Ready", + substatus: str = None, + description: str = None, +) -> bytes: + """Build the health report XML for ready or failure.""" + detail_subsection = "" + if substatus is not None: + detail_subsection = HEALTH_DETAIL_XML.format( + substatus=escape(substatus), + description=escape( + (description or "")[:DESCRIPTION_TRIM_LEN] + ), + ) + + return HEALTH_REPORT_XML.format( + incarnation=escape(str(goalstate["incarnation"])), + container_id=escape(goalstate["container_id"]), + instance_id=escape(goalstate["instance_id"]), + status=escape(status), + detail_subsection=detail_subsection, + ).encode("utf-8") + + +def _send_report( + endpoint: str, + status: str, + substatus: str = None, + description: str = None, + retries: int = DEFAULT_RETRIES, + retry_delay: float = DEFAULT_RETRY_DELAY, +) -> None: + """Fetch GoalState and send a health report to Azure fabric.""" + last_err = None + for attempt in range(1, retries + 1): + try: + LOG.info( + "Fetching GoalState from %s (attempt %d/%d)", + endpoint, + attempt, + retries, + ) + goalstate = fetch_goalstate(endpoint) + LOG.info( + "GoalState: container=%s instance=%s incarnation=%s", + goalstate["container_id"], + goalstate["instance_id"], + goalstate["incarnation"], + ) + + document = build_health_report( + goalstate, + status=status, + substatus=substatus, + description=description, + ) + LOG.info("Sending %s health report to %s", status, endpoint) + http_post(endpoint, "/machine?comp=health", document) + LOG.info("Successfully reported %s to Azure fabric.", status) + return + except (HTTPException, OSError, RuntimeError, ValueError) as exc: + last_err = exc + LOG.warning( + "Attempt %d/%d failed: %s", attempt, retries, exc + ) + if attempt < retries: + time.sleep(retry_delay) + + LOG.error("Failed to report %s after %d attempts.", status, retries) + raise RuntimeError( + f"Failed to report {status} after {retries} attempts" + ) from last_err + + +def report_ready( + endpoint: str = DEFAULT_WIRESERVER_ENDPOINT, + retries: int = DEFAULT_RETRIES, + retry_delay: float = DEFAULT_RETRY_DELAY, +) -> None: + """Send a provisioning success (Ready) report to Azure fabric. + + Matches cloud-init's _report_ready flow: + 1. Write dmesg to KVP + 2. Write success PROVISIONING_REPORT to KVP + 3. Fetch GoalState and POST Ready health report to wireserver + + Skipped if the experimental_skip_ready_report marker exists, which + indicates cloud-init is handling the report itself. + """ + if not os.path.exists(REPORT_MARKER): + LOG.info( + "Skipping report_ready: marker file %s does not exist, " + "cloud-init is handling the ready report.", + REPORT_MARKER, + ) + return + vm_id = _get_vm_id() + report_dmesg_to_kvp() + kvp_report_success(vm_id) + _send_report( + endpoint=endpoint, + status="Ready", + retries=retries, + retry_delay=retry_delay, + ) + + +def report_failure( + description: str, + endpoint: str = DEFAULT_WIRESERVER_ENDPOINT, + retries: int = DEFAULT_RETRIES, + retry_delay: float = DEFAULT_RETRY_DELAY, +) -> None: + """Send a provisioning failure (NotReady) report to Azure fabric. + + Matches cloud-init's _report_failure flow: + 1. Write dmesg to KVP + 2. Write failure PROVISIONING_REPORT to KVP + 3. Fetch GoalState and POST NotReady health report to wireserver + + Skipped if the experimental_skip_ready_report marker exists. + """ + if not os.path.exists(REPORT_MARKER): + LOG.info( + "Skipping report_failure: marker file %s does not exist, " + "cloud-init is handling the ready report.", + REPORT_MARKER, + ) + return + vm_id = _get_vm_id() + report_dmesg_to_kvp() + kvp_report_failure(description, vm_id) + _send_report( + endpoint=endpoint, + status="NotReady", + substatus="ProvisioningFailed", + description=description, + retries=retries, + retry_delay=retry_delay, + ) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Report provisioning status to Azure wireserver." + ) + parser.add_argument( + "--endpoint", + default=DEFAULT_WIRESERVER_ENDPOINT, + help=f"Wireserver endpoint (default: {DEFAULT_WIRESERVER_ENDPOINT})", + ) + parser.add_argument( + "--retries", + type=int, + default=DEFAULT_RETRIES, + help=f"Number of retry attempts (default: {DEFAULT_RETRIES})", + ) + parser.add_argument( + "--retry-delay", + type=float, + default=DEFAULT_RETRY_DELAY, + help=f"Delay between retries in seconds (default: {DEFAULT_RETRY_DELAY})", + ) + parser.add_argument( + "--verbose", "-v", action="store_true", help="Enable debug logging" + ) + parser.add_argument( + "--failure", action="store_true", + help="Report provisioning failure instead of success", + ) + parser.add_argument( + "--description", type=str, default="", + help="Failure description (used with --failure)", + ) + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(name)s [%(levelname)s]: %(message)s", + ) + + try: + if args.failure: + report_failure( + description=args.description, + endpoint=args.endpoint, + retries=args.retries, + retry_delay=args.retry_delay, + ) + else: + report_ready( + endpoint=args.endpoint, + retries=args.retries, + retry_delay=args.retry_delay, + ) + return 0 + except RuntimeError: + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/vhdbuilder/packer/imagecustomizer/azlosguard/azlosguard.yml b/vhdbuilder/packer/imagecustomizer/azlosguard/azlosguard.yml index ca37e4f8444..ffb169afa02 100644 --- a/vhdbuilder/packer/imagecustomizer/azlosguard/azlosguard.yml +++ b/vhdbuilder/packer/imagecustomizer/azlosguard/azlosguard.yml @@ -124,6 +124,9 @@ os: - source: /AgentBaker/parts/linux/cloud-init/artifacts/cse_send_logs.py destination: /opt/azure/containers/provision_send_logs.py permissions: 744 + - source: /AgentBaker/parts/linux/cloud-init/artifacts/report_ready.py + destination: /opt/azure/containers/report_ready.py + permissions: 744 # private hosts - source: /AgentBaker/parts/linux/cloud-init/artifacts/reconcile-private-hosts.service destination: /etc/systemd/system/reconcile-private-hosts.service diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 0e60ef5a07e..c868399b81b 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -122,6 +122,15 @@ if ! isMarinerOrAzureLinux "$OS"; then disableNtpAndTimesyncdInstallChrony || exit 1 fi +skipCloudInitReadyReport || exit 1 + +if [ "$OS" = "$UBUNTU_OS_NAME" ]; then + install -m 0644 /home/packer/DataSourceAzure.py \ + /usr/lib/python3/dist-packages/cloudinit/sources/DataSourceAzure.py || exit 1 + python3 -m py_compile /usr/lib/python3/dist-packages/cloudinit/sources/DataSourceAzure.py || exit 1 + rm -f /home/packer/DataSourceAzure.py +fi + # ACL inherits Azure Linux behaviors but isMarinerOrAzureLinux returns false, # so these must be called separately (mirrored in the Mariner/AzureLinux block below). # Other Mariner functions are safe to skip for ACL: diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index c960d797a5c..a5ca2a1ce1c 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -156,6 +156,10 @@ copyPackerFiles() { CSE_SEND_DEST=/opt/azure/containers/provision_send_logs.py cpAndMode $CSE_SEND_SRC $CSE_SEND_DEST 0744 + REPORT_READY_SRC=/home/packer/report_ready.py + REPORT_READY_DEST=/opt/azure/containers/report_ready.py + cpAndMode $REPORT_READY_SRC $REPORT_READY_DEST 0744 + INIT_CUSTOM_CLOUD_SRC=/home/packer/init-aks-custom-cloud.sh INIT_CUSTOM_CLOUD_DEST=/opt/azure/containers/init-aks-custom-cloud.sh cpAndMode $INIT_CUSTOM_CLOUD_SRC $INIT_CUSTOM_CLOUD_DEST 0744 diff --git a/vhdbuilder/packer/ubuntu/cloudinit/DataSourceAzure.py b/vhdbuilder/packer/ubuntu/cloudinit/DataSourceAzure.py new file mode 100644 index 00000000000..299f2c781c5 --- /dev/null +++ b/vhdbuilder/packer/ubuntu/cloudinit/DataSourceAzure.py @@ -0,0 +1,2250 @@ +# Copyright (C) 2013 Canonical Ltd. +# +# Author: Scott Moser +# +# This file is part of cloud-init. See LICENSE file for license information. + +import base64 +import functools +import logging +import os +import os.path +import re +import socket +import warnings +import xml.etree.ElementTree as ET # nosec B405 +from enum import Enum +from pathlib import Path +from time import monotonic, sleep, time +from typing import Any, Dict, List, Optional, Tuple + +import requests + +from cloudinit import net, performance, sources, ssh_util, subp, util +from cloudinit.config import cc_mounts +from cloudinit.event import EventScope, EventType +from cloudinit.net import device_driver +from cloudinit.net.dhcp import ( + NoDHCPLeaseError, + NoDHCPLeaseInterfaceError, + NoDHCPLeaseMissingDhclientError, +) +from cloudinit.net.ephemeral import EphemeralDHCPv4, EphemeralIPv4Network +from cloudinit.reporting import events +from cloudinit.sources.azure import errors, identity, imds, kvp +from cloudinit.sources.helpers import netlink +from cloudinit.sources.helpers.azure import ( + DEFAULT_WIRESERVER_ENDPOINT, + NonAzureDataSource, + OvfEnvXml, + azure_ds_reporter, + azure_ds_telemetry_reporter, + build_minimal_ovf, + dhcp_log_cb, + get_boot_telemetry, + get_metadata_from_fabric, + get_system_info, + report_diagnostic_event, + report_dmesg_to_kvp, + report_failure_to_fabric, +) +from cloudinit.url_helper import UrlError + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=DeprecationWarning) + import crypt # pylint: disable=W4901 + + blowfish_hash: Any = functools.partial( + crypt.crypt, salt=f"$6${util.rand_str(strlen=16)}" + ) +except (ImportError, AttributeError): + try: + import passlib.hash + + blowfish_hash = passlib.hash.sha512_crypt.hash + except ImportError: + + def blowfish_hash(_): + """Raise when called so that importing this module doesn't throw + ImportError when ds_detect() returns false. In this case, crypt + and passlib are not needed. + """ + raise ImportError( + "crypt and passlib not found, missing dependency" + ) + + +LOG = logging.getLogger(__name__) + +DS_NAME = "Azure" +DEFAULT_METADATA = {"instance-id": "iid-AZURE-NODE"} + +# azure systems will always have a resource disk, and 66-azure-ephemeral.rules +# ensures that it gets linked to this path. +RESOURCE_DISK_PATH = "/dev/disk/cloud/azure_resource" +DEFAULT_FS = "ext4" +AGENT_SEED_DIR = "/var/lib/waagent" +DEFAULT_PROVISIONING_ISO_DEV = "/dev/sr0" + + +class PPSType(Enum): + NONE = "None" + OS_DISK = "PreprovisionedOSDisk" + RUNNING = "Running" + SAVABLE = "Savable" + UNKNOWN = "Unknown" + + +PLATFORM_ENTROPY_SOURCE: Optional[str] = "/sys/firmware/acpi/tables/OEM0" + +# List of static scripts and network config artifacts created by +# stock ubuntu supported images. +UBUNTU_EXTENDED_NETWORK_SCRIPTS = [ + "/etc/netplan/90-hotplug-azure.yaml", + "/usr/local/sbin/ephemeral_eth.sh", + "/etc/udev/rules.d/10-net-device-added.rules", + "/run/network/interfaces.ephemeral.d", +] + + +def find_storvscid_from_sysctl_pnpinfo(sysctl_out, deviceid): + # extract the 'X' from dev.storvsc.X. if deviceid matches + """ + dev.storvsc.1.%pnpinfo: + classid=32412632-86cb-44a2-9b5c-50d1417354f5 + deviceid=00000000-0001-8899-0000-000000000000 + """ + for line in sysctl_out.splitlines(): + if re.search(r"pnpinfo", line): + fields = line.split() + if len(fields) >= 3: + columns = fields[2].split("=") + if ( + len(columns) >= 2 + and columns[0] == "deviceid" + and columns[1].startswith(deviceid) + ): + comps = fields[0].split(".") + return comps[2] + return None + + +def find_busdev_from_disk(camcontrol_out, disk_drv): + # find the scbusX from 'camcontrol devlist -b' output + # if disk_drv matches the specified disk driver, i.e. blkvsc1 + """ + scbus0 on ata0 bus 0 + scbus1 on ata1 bus 0 + scbus2 on blkvsc0 bus 0 + scbus3 on blkvsc1 bus 0 + scbus4 on storvsc2 bus 0 + scbus5 on storvsc3 bus 0 + scbus-1 on xpt0 bus 0 + """ + for line in camcontrol_out.splitlines(): + if re.search(disk_drv, line): + items = line.split() + return items[0] + return None + + +def find_dev_from_busdev(camcontrol_out: str, busdev: str) -> Optional[str]: + # find the daX from 'camcontrol devlist' output + # if busdev matches the specified value, i.e. 'scbus2' + """ + at scbus1 target 0 lun 0 (cd0,pass0) + at scbus2 target 0 lun 0 (da0,pass1) + at scbus3 target 1 lun 0 (da1,pass2) + """ + for line in camcontrol_out.splitlines(): + if re.search(busdev, line): + items = line.split("(") + if len(items) == 2: + dev_pass = items[1].split(",") + return dev_pass[0] + return None + + +def normalize_mac_address(mac: str) -> str: + """Normalize mac address with colons and lower-case.""" + if len(mac) == 12: + mac = ":".join( + [mac[0:2], mac[2:4], mac[4:6], mac[6:8], mac[8:10], mac[10:12]] + ) + + return mac.lower() + + +@azure_ds_telemetry_reporter +def get_hv_netvsc_macs_normalized() -> List[str]: + """Get Hyper-V NICs as normalized MAC addresses.""" + return [ + normalize_mac_address(n[1]) + for n in net.get_interfaces() + if n[2] == "hv_netvsc" + ] + + +@azure_ds_telemetry_reporter +def determine_device_driver_for_mac(mac: str) -> Optional[str]: + """Determine the device driver to match on, if any.""" + drivers = [ + i[2] + for i in net.get_interfaces() + if mac == normalize_mac_address(i[1]) + ] + if "hv_netvsc" in drivers: + return "hv_netvsc" + + if len(drivers) == 1: + report_diagnostic_event( + "Assuming driver for interface with mac=%s drivers=%r" + % (mac, drivers), + logger_func=LOG.debug, + ) + return drivers[0] + + report_diagnostic_event( + "Unable to specify driver for interface with mac=%s drivers=%r" + % (mac, drivers), + logger_func=LOG.warning, + ) + return None + + +def execute_or_debug(cmd, fail_ret=None) -> str: + try: + return subp.subp(cmd).stdout # pyright: ignore + except subp.ProcessExecutionError: + LOG.debug("Failed to execute: %s", " ".join(cmd)) + return fail_ret + + +def get_dev_storvsc_sysctl(): + return execute_or_debug(["sysctl", "dev.storvsc"], fail_ret="") + + +def get_camcontrol_dev_bus(): + return execute_or_debug(["camcontrol", "devlist", "-b"]) + + +def get_camcontrol_dev(): + return execute_or_debug(["camcontrol", "devlist"]) + + +def get_resource_disk_on_freebsd(port_id) -> Optional[str]: + g0 = "00000000" + if port_id > 1: + g0 = "00000001" + port_id = port_id - 2 + g1 = "000" + str(port_id) + g0g1 = "{0}-{1}".format(g0, g1) + + # search 'X' from + # 'dev.storvsc.X.%pnpinfo: + # classid=32412632-86cb-44a2-9b5c-50d1417354f5 + # deviceid=00000000-0001-8899-0000-000000000000' + sysctl_out = get_dev_storvsc_sysctl() + + storvscid = find_storvscid_from_sysctl_pnpinfo(sysctl_out, g0g1) + if not storvscid: + LOG.debug("Fail to find storvsc id from sysctl") + return None + + camcontrol_b_out = get_camcontrol_dev_bus() + camcontrol_out = get_camcontrol_dev() + # try to find /dev/XX from 'blkvsc' device + blkvsc = "blkvsc{0}".format(storvscid) + scbusx = find_busdev_from_disk(camcontrol_b_out, blkvsc) + if scbusx: + devname = find_dev_from_busdev(camcontrol_out, scbusx) + if devname is None: + LOG.debug("Fail to find /dev/daX") + return None + return devname + # try to find /dev/XX from 'storvsc' device + storvsc = "storvsc{0}".format(storvscid) + scbusx = find_busdev_from_disk(camcontrol_b_out, storvsc) + if scbusx: + devname = find_dev_from_busdev(camcontrol_out, scbusx) + if devname is None: + LOG.debug("Fail to find /dev/daX") + return None + return devname + return None + + +# update the FreeBSD specific information +if util.is_FreeBSD(): + DEFAULT_FS = "freebsd-ufs" + res_disk = get_resource_disk_on_freebsd(1) + if res_disk is not None: + LOG.debug("resource disk is not None") + RESOURCE_DISK_PATH = "/dev/" + res_disk + else: + LOG.debug("resource disk is None") + # TODO Find where platform entropy data is surfaced + PLATFORM_ENTROPY_SOURCE = None + +BUILTIN_DS_CONFIG = { + "data_dir": AGENT_SEED_DIR, + "disk_aliases": {"ephemeral0": RESOURCE_DISK_PATH}, + "apply_network_config": True, # Use IMDS published network configuration + "apply_network_config_for_secondary_ips": True, # Configure secondary ips + "experimental_skip_ready_report": False, # Skip final ready report +} + +BUILTIN_CLOUD_EPHEMERAL_DISK_CONFIG = { + "disk_setup": { + "ephemeral0": { + "table_type": "gpt", + "layout": [100], + "overwrite": True, + }, + }, + "fs_setup": [{"filesystem": DEFAULT_FS, "device": "ephemeral0.1"}], +} + +DS_CFG_PATH = ["datasource", DS_NAME] +DS_CFG_KEY_PRESERVE_NTFS = "never_destroy_ntfs" + +# The redacted password fails to meet password complexity requirements +# so we can safely use this to mask/redact the password in the ovf-env.xml +DEF_PASSWD_REDACTION = "REDACTED" + + +class DataSourceAzure(sources.DataSource): + dsname = "Azure" + default_update_events = { + EventScope.NETWORK: { + EventType.BOOT_NEW_INSTANCE, + EventType.BOOT, + } + } + _negotiated = False + _metadata_imds = sources.UNSET + _ci_pkl_version = 1 + + def __init__(self, sys_cfg, distro, paths): + sources.DataSource.__init__(self, sys_cfg, distro, paths) + self.seed_dir = os.path.join(paths.seed_dir, "azure") + self.cfg = {} + self.seed = None + self.ds_cfg = util.mergemanydict( + [util.get_cfg_by_path(sys_cfg, DS_CFG_PATH, {}), BUILTIN_DS_CONFIG] + ) + self._iso_dev = None + self._network_config = None + self._ephemeral_dhcp_ctx: Optional[EphemeralDHCPv4] = None + self._reported_ready_marker_file = os.path.join( + paths.cloud_dir, "data", "reported_ready" + ) + self._route_configured_for_imds = False + self._route_configured_for_wireserver = False + self._system_uuid = None + self._vm_id = None + self._wireserver_endpoint = DEFAULT_WIRESERVER_ENDPOINT + + def _unpickle(self, ci_pkl_version: int) -> None: + super()._unpickle(ci_pkl_version) + + self._ephemeral_dhcp_ctx = None + self._iso_dev = None + self._reported_ready_marker_file = os.path.join( + self.paths.cloud_dir, "data", "reported_ready" + ) + self._route_configured_for_imds = False + self._route_configured_for_wireserver = False + self._system_uuid = None + self._vm_id = None + self._wireserver_endpoint = DEFAULT_WIRESERVER_ENDPOINT + + def __str__(self): + root = sources.DataSource.__str__(self) + return "%s [seed=%s]" % (root, self.seed) + + def _get_subplatform(self): + """Return the subplatform metadata source details.""" + if self.seed is None: + subplatform_type = "unknown" + elif self.seed.startswith("/dev"): + subplatform_type = "config-disk" + elif self.seed.lower() == "imds": + subplatform_type = "imds" + else: + subplatform_type = "seed-dir" + return "%s (%s)" % (subplatform_type, self.seed) + + @azure_ds_telemetry_reporter + def _check_if_primary(self, ephipv4: EphemeralIPv4Network) -> bool: + if not ephipv4.static_routes: + # Primary nics must contain routes. + return False + + routed_networks = [r[0].split("/")[0] for r in ephipv4.static_routes] + + # Expected to be true for all of Azure public cloud and future Azure + # Stack versions with IMDS capabilities, but false for existing ones. + self._route_configured_for_imds = "169.254.169.254" in routed_networks + + # Expected to be true for Azure public cloud and Azure Stack. + self._route_configured_for_wireserver = ( + self._wireserver_endpoint in routed_networks + ) + + return ( + self._route_configured_for_imds + or self._route_configured_for_wireserver + ) + + @azure_ds_telemetry_reporter + def _setup_ephemeral_networking( + self, + *, + iface: Optional[str] = None, + report_failure_if_not_primary: bool = True, + retry_sleep: int = 1, + timeout_minutes: int = 5, + ) -> bool: + """Setup ephemeral networking. + + Keep retrying DHCP up to specified number of minutes. This does + not kill dhclient, so the timeout in practice may be up to + timeout_minutes + the system-configured timeout for dhclient. + + :param timeout_minutes: Number of minutes to keep retrying for. + + :raises NoDHCPLeaseError: If unable to obtain DHCP lease. + + :returns: True if NIC is determined to be primary. + """ + if self._ephemeral_dhcp_ctx is not None: + raise RuntimeError( + "Bringing up networking when already configured." + ) + + self._ephemeral_dhcp_ctx = EphemeralDHCPv4( + self.distro, + iface=iface, + dhcp_log_func=dhcp_log_cb, + ) + + update_primary_nic = iface is None + lease: Optional[Dict[str, Any]] = None + start_time = monotonic() + deadline = start_time + timeout_minutes * 60 + with events.ReportEventStack( + name="obtain-dhcp-lease", + description="obtain dhcp lease", + parent=azure_ds_reporter, + ): + while lease is None: + try: + if update_primary_nic: + iface = find_primary_nic() + + mac, driver = get_interface_details(iface) + + report_diagnostic_event( + "Bringing up ephemeral networking with " + "iface=%s mac=%s driver=%s: %r" + % (iface, mac, driver, net.get_interfaces()), + logger_func=LOG.debug, + ) + + self._ephemeral_dhcp_ctx.iface = iface + lease = self._ephemeral_dhcp_ctx.obtain_lease() + except NoDHCPLeaseInterfaceError: + # Interface not found, continue after sleeping 1 second. + report_diagnostic_event( + "Interface not found for DHCP", logger_func=LOG.warning + ) + self._report_failure( + errors.ReportableErrorDhcpInterfaceNotFound( + duration=monotonic() - start_time + ), + host_only=True, + ) + except NoDHCPLeaseMissingDhclientError: + # No dhclient, no point in retrying. + report_diagnostic_event( + "dhclient executable not found", logger_func=LOG.error + ) + self._ephemeral_dhcp_ctx = None + raise + except NoDHCPLeaseError: + # Typical DHCP failure, continue after sleeping 1 second. + report_diagnostic_event( + "Failed to obtain DHCP lease " + "(iface=%s mac=%s driver=%s)" % (iface, mac, driver), + logger_func=LOG.error, + ) + self._report_failure( + errors.ReportableErrorDhcpLease( + driver=driver, + duration=monotonic() - start_time, + interface=iface, + mac_address=mac, + ), + host_only=True, + ) + except FileNotFoundError as error: + report_diagnostic_event( + "File not found during DHCP %r" % error, + logger_func=LOG.error, + ) + except subp.ProcessExecutionError as error: + # udevadm settle, ip link set dev eth0 up, etc. + report_diagnostic_event( + "Command failed: " + "cmd=%r stderr=%r stdout=%r exit_code=%s" + % ( + error.cmd, + error.stderr, + error.stdout, + error.exit_code, + ), + logger_func=LOG.error, + ) + + # Sleep before retrying, otherwise break if past deadline. + if lease is None and monotonic() + retry_sleep < deadline: + sleep(retry_sleep) + else: + break + + if lease is None: + self._ephemeral_dhcp_ctx = None + raise NoDHCPLeaseError() + + # Ensure iface is set. + iface = lease["interface"] + self._ephemeral_dhcp_ctx.iface = iface + + # Update wireserver IP from DHCP options. + if "unknown-245" in lease: + self._wireserver_endpoint = lease["unknown-245"] + + driver = device_driver(iface) + ephipv4 = self._ephemeral_dhcp_ctx._ephipv4 + if ephipv4 is None: + raise RuntimeError("dhcp context missing ephipv4") + + primary = self._check_if_primary(ephipv4) + report_diagnostic_event( + "Obtained DHCP lease on interface %r " + "(primary=%r driver=%r router=%r routes=%r lease=%r " + "imds_routed=%r wireserver_routed=%r)" + % ( + iface, + primary, + driver, + ephipv4.router, + ephipv4.static_routes, + lease, + self._route_configured_for_imds, + self._route_configured_for_wireserver, + ), + logger_func=LOG.debug, + ) + + if report_failure_if_not_primary and not primary: + self._report_failure( + errors.ReportableErrorDhcpOnNonPrimaryInterface( + interface=iface, + driver=driver, + router=ephipv4.router, + static_routes=ephipv4.static_routes, + lease=lease, + ), + host_only=True, + ) + return primary + + @azure_ds_telemetry_reporter + def _teardown_ephemeral_networking(self) -> None: + """Teardown ephemeral networking.""" + self._route_configured_for_imds = False + self._route_configured_for_wireserver = False + if self._ephemeral_dhcp_ctx is None: + return + + self._ephemeral_dhcp_ctx.clean_network() + self._ephemeral_dhcp_ctx = None + + def _is_ephemeral_networking_up(self) -> bool: + """Check if networking is configured.""" + return not ( + self._ephemeral_dhcp_ctx is None + or self._ephemeral_dhcp_ctx.lease is None + ) + + def _check_azure_proxy_agent_status(self) -> None: + """Check if azure-proxy-agent is ready for communication with WS/IMDS. + + If ProvisionGuestProxyAgent is true, query azure-proxy-agent status, + waiting up to 120 seconds for the proxy to negotiate with Wireserver + and configure an eBPF proxy. Once azure-proxy-agent is ready, + it will exit with code 0 and cloud-init can then expect to be able to + communicate with these services. + + Fail deployment if azure-proxy-agent is not found or otherwise returns + an error. + + For more information, check out: + https://github.com/azure/guestproxyagent + """ + try: + cmd = [ + "azure-proxy-agent", + "--status", + "--wait", + "120", + ] + out, err = subp.subp(cmd) + report_diagnostic_event( + "Executing %s resulted " + "in stderr=%r with stdout=%r" % (cmd, err, out), + logger_func=LOG.debug, + ) + except subp.ProcessExecutionError as error: + if isinstance(error.reason, FileNotFoundError): + LOG.error( + "Failed to activate Azure Guest Proxy Agent: " + "azure-proxy-agent not found" + ) + report_error = errors.ReportableErrorProxyAgentNotFound() + self._report_failure(report_error) + else: + report_diagnostic_event( + "Failed to activate Azure Guest Proxy Agent: " + "status check failed " + "cmd=%r stderr=%r stdout=%r exit_code=%s" + % ( + error.cmd, + error.stderr, + error.stdout, + error.exit_code, + ), + logger_func=LOG.error, + ) + reportable_error = ( + errors.ReportableErrorProxyAgentStatusFailure(error) + ) + self._report_failure(reportable_error) + + @azure_ds_telemetry_reporter + def crawl_metadata(self): + """Walk all instance metadata sources returning a dict on success. + + @return: A dictionary of any metadata content for this instance. + @raise: InvalidMetaDataException when the expected metadata service is + unavailable, broken or disabled. + """ + self._query_vm_id() + report_diagnostic_event( + "Azure VM ID: %s System UUID: %s" + % (self._vm_id, self._system_uuid), + logger_func=LOG.info, + ) + + crawled_data = {} + # azure removes/ejects the cdrom containing the ovf-env.xml + # file on reboot. So, in order to successfully reboot we + # need to look in the datadir and consider that valid + ddir = self.ds_cfg["data_dir"] + + # The order in which the candidates are inserted matters here, because + # it determines the value of ret. More specifically, the first one in + # the candidate list determines the path to take in order to get the + # metadata we need. + ovf_source = None + md = {"local-hostname": ""} + cfg = {"system_info": {"default_user": {"name": ""}}} + userdata_raw = "" + files = {} + + for src in list_possible_azure_ds(self.seed_dir, ddir): + try: + if src.startswith("/dev/"): + if util.is_FreeBSD(): + md, userdata_raw, cfg, files = util.mount_cb( + src, load_azure_ds_dir, mtype="udf" + ) + else: + md, userdata_raw, cfg, files = util.mount_cb( + src, load_azure_ds_dir + ) + # save the device for ejection later + self._iso_dev = src + else: + md, userdata_raw, cfg, files = load_azure_ds_dir(src) + + ovf_source = src + report_diagnostic_event( + "Found provisioning metadata in %s" % ovf_source, + logger_func=LOG.debug, + ) + break + except NonAzureDataSource: + report_diagnostic_event( + "Did not find Azure data source in %s" % src, + logger_func=LOG.debug, + ) + continue + except util.MountFailedError: + report_diagnostic_event( + "%s was not mountable" % src, logger_func=LOG.debug + ) + continue + else: + msg = ( + "Unable to find provisioning media, falling back to IMDS " + "metadata. Be aware that IMDS metadata does not support " + "admin passwords or custom-data (user-data only)." + ) + report_diagnostic_event(msg, logger_func=LOG.warning) + + # If we read OVF from attached media, we are provisioning. If OVF + # is not found, we are probably provisioning on a system which does + # not have UDF support. In either case, require IMDS metadata. + # If we require IMDS metadata, try harder to obtain networking, waiting + # for at least 20 minutes. Otherwise only wait 5 minutes. + requires_imds_metadata = bool(self._iso_dev) or ovf_source is None + timeout_minutes = 20 if requires_imds_metadata else 5 + try: + self._setup_ephemeral_networking(timeout_minutes=timeout_minutes) + except NoDHCPLeaseError: + pass + + imds_md = {} + if self._is_ephemeral_networking_up(): + # check if azure-proxy-agent is enabled in the ovf-env.xml file. + # azure-proxy-agent feature is opt-in and disabled by default. + if cfg.get("ProvisionGuestProxyAgent"): + self._check_azure_proxy_agent_status() + + imds_md = self.get_metadata_from_imds(report_failure=True) + + if not imds_md and ovf_source is None: + msg = "No OVF or IMDS available" + report_diagnostic_event(msg) + raise sources.InvalidMetaDataException(msg) + + # Refresh PPS type using metadata. + pps_type = self._determine_pps_type(cfg, imds_md) + if pps_type != PPSType.NONE: + if util.is_FreeBSD(): + msg = "Free BSD is not supported for PPS VMs" + report_diagnostic_event(msg, logger_func=LOG.error) + raise sources.InvalidMetaDataException(msg) + + # Networking is a hard requirement for source PPS, fail without it. + if not self._is_ephemeral_networking_up(): + msg = "DHCP failed while in source PPS" + report_diagnostic_event(msg, logger_func=LOG.error) + raise sources.InvalidMetaDataException(msg) + + if pps_type == PPSType.RUNNING: + self._wait_for_pps_running_reuse() + elif pps_type == PPSType.SAVABLE: + self._wait_for_pps_savable_reuse() + elif pps_type == PPSType.OS_DISK: + self._wait_for_pps_os_disk_shutdown() + else: + self._wait_for_pps_unknown_reuse() + + md, userdata_raw, cfg, files = self._reprovision() + if cfg.get("ProvisionGuestProxyAgent"): + self._check_azure_proxy_agent_status() + + # fetch metadata again as it has changed after reprovisioning + imds_md = self.get_metadata_from_imds(report_failure=True) + + # validate imds pps metadata + imds_ppstype = self._ppstype_from_imds(imds_md) + if imds_ppstype not in (None, PPSType.NONE.value): + self._report_failure( + errors.ReportableErrorImdsInvalidMetadata( + key="extended.compute.ppsType", value=imds_ppstype + ) + ) + + # Report errors if IMDS network configuration is missing data. + self.validate_imds_network_metadata(imds_md=imds_md) + + self.seed = ovf_source or "IMDS" + crawled_data.update( + { + "cfg": cfg, + "files": files, + "metadata": util.mergemanydict([md, {"imds": imds_md}]), + "userdata_raw": userdata_raw, + } + ) + imds_username = _username_from_imds(imds_md) + imds_hostname = _hostname_from_imds(imds_md) + imds_disable_password = _disable_password_from_imds(imds_md) + if imds_username: + LOG.debug("Username retrieved from IMDS: %s", imds_username) + cfg["system_info"]["default_user"]["name"] = imds_username + if imds_hostname: + LOG.debug("Hostname retrieved from IMDS: %s", imds_hostname) + crawled_data["metadata"]["local-hostname"] = imds_hostname + if imds_disable_password is not None: + LOG.debug( + "Disable password retrieved from IMDS: %s", + imds_disable_password, + ) + crawled_data["cfg"]["ssh_pwauth"] = not imds_disable_password + + if self.seed == "IMDS" and not crawled_data["files"]: + try: + contents = build_minimal_ovf( + username=imds_username, + hostname=imds_hostname, + disable_ssh_password_auth=imds_disable_password, + ) + crawled_data["files"] = {"ovf-env.xml": contents} + except Exception as e: + report_diagnostic_event( + "Failed to construct OVF from IMDS data %s" % e, + logger_func=LOG.debug, + ) + + # only use userdata from imds if OVF did not provide custom data + # userdata provided by IMDS is always base64 encoded + if not userdata_raw: + imds_userdata = _userdata_from_imds(imds_md) + if imds_userdata: + LOG.debug("Retrieved userdata from IMDS") + try: + crawled_data["userdata_raw"] = base64.b64decode( + "".join(imds_userdata.split()) + ) + except Exception: + report_diagnostic_event( + "Bad userdata in IMDS", logger_func=LOG.warning + ) + + if ovf_source == ddir: + report_diagnostic_event( + "using files cached in %s" % ddir, logger_func=LOG.debug + ) + + seed = _get_random_seed() + if seed: + crawled_data["metadata"]["random_seed"] = seed + crawled_data["metadata"]["instance-id"] = self._iid() + + if self._negotiated is False and self._is_ephemeral_networking_up(): + if self.ds_cfg.get("experimental_skip_ready_report", False): + LOG.debug( + "Skipping final health report as " + "experimental_skip_ready_report is enabled." + ) + else: + # Report ready and fetch public-keys from Wireserver, + # if required. + pubkey_info = self._determine_wireserver_pubkey_info( + cfg=cfg, imds_md=imds_md + ) + try: + ssh_keys = self._report_ready(pubkey_info=pubkey_info) + except Exception: + # Failed to report ready, but continue with best effort. + pass + else: + LOG.debug("negotiating returned %s", ssh_keys) + if ssh_keys: + crawled_data["metadata"]["public-keys"] = ssh_keys + + self._cleanup_markers() + + return crawled_data + + @azure_ds_telemetry_reporter + def get_metadata_from_imds(self, report_failure: bool) -> Dict: + start_time = monotonic() + retry_deadline = start_time + 300 + + # As a temporary workaround to support Azure Stack implementations + # which may not enable IMDS, limit connection errors to 11. + if not self._route_configured_for_imds: + max_connection_errors = 11 + else: + max_connection_errors = None + + error_string: Optional[str] = None + error_report: Optional[errors.ReportableError] = None + try: + return imds.fetch_metadata_with_api_fallback( + max_connection_errors=max_connection_errors, + retry_deadline=retry_deadline, + ) + except UrlError as error: + error_string = str(error) + duration = monotonic() - start_time + error_report = errors.ReportableErrorImdsUrlError( + exception=error, duration=duration + ) + + # As a temporary workaround to support Azure Stack implementations + # which may not enable IMDS, don't report connection errors to + # wireserver if route is not configured. + if not self._route_configured_for_imds and isinstance( + error.cause, requests.ConnectionError + ): + report_failure = False + except ValueError as error: + error_string = str(error) + error_report = errors.ReportableErrorImdsMetadataParsingException( + exception=error + ) + + self._report_failure(error_report, host_only=not report_failure) + report_diagnostic_event( + "Ignoring IMDS metadata due to: %s" % error_string, + logger_func=LOG.warning, + ) + return {} + + def clear_cached_attrs(self, attr_defaults=()): + """Reset any cached class attributes to defaults.""" + super(DataSourceAzure, self).clear_cached_attrs(attr_defaults) + self._metadata_imds = sources.UNSET + + @azure_ds_telemetry_reporter + def ds_detect(self): + """Check platform environment to report if this datasource may + run. + """ + chassis_tag = identity.ChassisAssetTag.query_system() + if chassis_tag is not None: + return True + + # If no valid chassis tag, check for seeded ovf-env.xml. + if self.seed_dir is None: + return False + + return Path(self.seed_dir, "ovf-env.xml").exists() + + @azure_ds_telemetry_reporter + def _get_data(self): + """Crawl and process datasource metadata caching metadata as attrs. + + @return: True on success, False on error, invalid or disabled + datasource. + """ + try: + get_boot_telemetry() + except Exception as e: + LOG.warning("Failed to get boot telemetry: %s", e) + + try: + get_system_info() + except Exception as e: + LOG.warning("Failed to get system information: %s", e) + + try: + crawled_data = self.crawl_metadata() + except errors.ReportableError as error: + self._report_failure(error) + return False + except Exception as error: + reportable_error = errors.ReportableErrorUnhandledException(error) + self._report_failure(reportable_error) + return False + finally: + self._teardown_ephemeral_networking() + + if ( + self.distro + and self.distro.name == "ubuntu" + and self.ds_cfg.get("apply_network_config") + ): + maybe_remove_ubuntu_network_config_scripts() + + # Process crawled data and augment with various config defaults + + # Only merge in default cloud config related to the ephemeral disk + # if the ephemeral disk exists + devpath = RESOURCE_DISK_PATH + if os.path.exists(devpath): + report_diagnostic_event( + "Ephemeral resource disk '%s' exists. " + "Merging default Azure cloud ephemeral disk configs." + % devpath, + logger_func=LOG.debug, + ) + self.cfg = util.mergemanydict( + [crawled_data["cfg"], BUILTIN_CLOUD_EPHEMERAL_DISK_CONFIG] + ) + else: + report_diagnostic_event( + "Ephemeral resource disk '%s' does not exist. " + "Not merging default Azure cloud ephemeral disk configs." + % devpath, + logger_func=LOG.debug, + ) + self.cfg = crawled_data["cfg"] + + self._metadata_imds = crawled_data["metadata"]["imds"] + self.metadata = util.mergemanydict( + [crawled_data["metadata"], DEFAULT_METADATA] + ) + self.userdata_raw = crawled_data["userdata_raw"] + + # walinux agent writes files world readable, but expects + # the directory to be protected. + write_files( + self.ds_cfg["data_dir"], crawled_data["files"], dirmode=0o700 + ) + return True + + def get_instance_id(self): + if not self.metadata or "instance-id" not in self.metadata: + return self._iid() + return str(self.metadata["instance-id"]) + + def device_name_to_device(self, name): + return self.ds_cfg["disk_aliases"].get(name) + + @azure_ds_telemetry_reporter + def get_public_ssh_keys(self) -> List[str]: + """ + Retrieve public SSH keys. + """ + try: + return self._get_public_keys_from_imds(self.metadata["imds"]) + except (KeyError, ValueError): + pass + + return self._get_public_keys_from_ovf() + + def _get_public_keys_from_imds(self, imds_md: dict) -> List[str]: + """Get SSH keys from IMDS metadata. + + :raises KeyError: if IMDS metadata is malformed/missing. + :raises ValueError: if key format is not supported. + + :returns: List of keys. + """ + try: + ssh_keys = [ + public_key["keyData"] + for public_key in imds_md["compute"]["publicKeys"] + ] + except KeyError: + log_msg = "No SSH keys found in IMDS metadata" + report_diagnostic_event(log_msg, logger_func=LOG.debug) + raise + + if any(not _key_is_openssh_formatted(key=key) for key in ssh_keys): + log_msg = "Key(s) not in OpenSSH format" + report_diagnostic_event(log_msg, logger_func=LOG.debug) + raise ValueError(log_msg) + + log_msg = "Retrieved {} keys from IMDS".format(len(ssh_keys)) + report_diagnostic_event(log_msg, logger_func=LOG.debug) + return ssh_keys + + def _get_public_keys_from_ovf(self) -> List[str]: + """Get SSH keys that were fetched from wireserver. + + :returns: List of keys. + """ + ssh_keys = [] + try: + ssh_keys = self.metadata["public-keys"] + log_msg = "Retrieved {} keys from OVF".format(len(ssh_keys)) + report_diagnostic_event(log_msg, logger_func=LOG.debug) + except KeyError: + log_msg = "No keys available from OVF" + report_diagnostic_event(log_msg, logger_func=LOG.debug) + + return ssh_keys + + def get_config_obj(self): + return self.cfg + + def check_instance_id(self, sys_cfg): + # quickly (local check only) if self.instance_id is still valid + return sources.instance_id_matches_system_uuid(self.get_instance_id()) + + def _query_vm_id(self): + """Query the system UUID and VM IDs, if needed. + + They are initialized to None, check only if they are unset. + + Raise as reportable error on failure. + """ + if not self._system_uuid: + try: + self._system_uuid = identity.query_system_uuid() + except RuntimeError as error: + raise errors.ReportableErrorVmIdentification(exception=error) + + if not self._vm_id: + try: + self._vm_id = identity.convert_system_uuid_to_vm_id( + self._system_uuid + ) + except ValueError as error: + raise errors.ReportableErrorVmIdentification( + exception=error, system_uuid=self._system_uuid + ) + + def _iid(self, previous=None): + self._query_vm_id() + + prev_iid_path = os.path.join( + self.paths.get_cpath("data"), "instance-id" + ) + if os.path.exists(prev_iid_path): + previous = util.load_text_file(prev_iid_path).strip() + swapped_id = identity.byte_swap_system_uuid(self._system_uuid) + + # Older kernels than 4.15 will have UPPERCASE product_uuid. + # We don't want Azure to react to an UPPER/lower difference as + # a new instance id as it rewrites SSH host keys. + # LP: #1835584 + if previous.lower() in [self._system_uuid, swapped_id]: + return previous + return self._system_uuid + + @azure_ds_telemetry_reporter + def _wait_for_nic_detach(self, nl_sock): + """Use the netlink socket provided to wait for nic detach event. + NOTE: The function doesn't close the socket. The caller owns closing + the socket and disposing it safely. + """ + try: + ifname = None + + # Preprovisioned VM will only have one NIC, and it gets + # detached immediately after deployment. + with events.ReportEventStack( + name="wait-for-nic-detach", + description="wait for nic detach", + parent=azure_ds_reporter, + ): + ifname = netlink.wait_for_nic_detach_event(nl_sock) + if ifname is None: + msg = ( + "Preprovisioned nic not detached as expected. " + "Proceeding without failing." + ) + report_diagnostic_event(msg, logger_func=LOG.warning) + else: + report_diagnostic_event( + "The preprovisioned nic %s is detached" % ifname, + logger_func=LOG.debug, + ) + except AssertionError as error: + report_diagnostic_event(str(error), logger_func=LOG.error) + raise + + @azure_ds_telemetry_reporter + def wait_for_link_up( + self, ifname: str, retries: int = 100, retry_sleep: float = 0.1 + ): + for i in range(retries): + if self.distro.networking.try_set_link_up(ifname): + report_diagnostic_event( + "The link %s is up." % ifname, logger_func=LOG.info + ) + break + + if (i + 1) < retries: + sleep(retry_sleep) + else: + report_diagnostic_event( + "The link %s is not up after %f seconds, continuing anyways." + % (ifname, retries * retry_sleep), + logger_func=LOG.info, + ) + + @azure_ds_telemetry_reporter + def _create_report_ready_marker(self): + path = self._reported_ready_marker_file + LOG.info("Creating a marker file to report ready: %s", path) + util.write_file( + path, "{pid}: {time}\n".format(pid=os.getpid(), time=time()) + ) + report_diagnostic_event( + "Successfully created reported ready marker file " + "while in the preprovisioning pool.", + logger_func=LOG.debug, + ) + + @azure_ds_telemetry_reporter + def _report_ready_for_pps( + self, + *, + create_marker: bool = True, + expect_url_error: bool = False, + ) -> None: + """Report ready for PPS, creating the marker file upon completion. + + :raises sources.InvalidMetaDataException: On error reporting ready. + """ + try: + self._report_ready() + except Exception as error: + # Ignore HTTP failures for Savable PPS as the call may appear to + # fail if the network interface is unplugged or the VM is + # suspended before we process the response. Worst case scenario + # is that we failed to report ready for source PPS and this VM + # will be discarded shortly, no harm done. + if expect_url_error and isinstance(error, UrlError): + report_diagnostic_event( + "Ignoring http call failure, it was expected.", + logger_func=LOG.debug, + ) + # The iso was ejected prior to reporting ready. + self._iso_dev = None + else: + msg = ( + "Failed reporting ready while in the preprovisioning pool." + ) + report_diagnostic_event(msg, logger_func=LOG.error) + raise sources.InvalidMetaDataException(msg) from error + + # Reset flag as we will need to report ready again for re-use. + self._negotiated = False + + if create_marker: + self._create_report_ready_marker() + + @azure_ds_telemetry_reporter + def _wait_for_hot_attached_primary_nic(self, nl_sock): + """Wait until the primary nic for the vm is hot-attached.""" + LOG.info("Waiting for primary nic to be hot-attached") + try: + nics_found = [] + primary_nic_found = False + + # Wait for netlink nic attach events. After the first nic is + # attached, we are already in the customer vm deployment path and + # so everything from then on should happen fast and avoid + # unnecessary delays wherever possible. + while True: + ifname = None + with events.ReportEventStack( + name="wait-for-nic-attach", + description=( + "wait for nic attach after %d nics have been attached" + % len(nics_found) + ), + parent=azure_ds_reporter, + ): + ifname = netlink.wait_for_nic_attach_event( + nl_sock, nics_found + ) + + # wait_for_nic_attach_event guarantees that ifname it not None + nics_found.append(ifname) + report_diagnostic_event( + "Detected nic %s attached." % ifname, logger_func=LOG.info + ) + + # Attempt to bring the interface's operating state to + # UP in case it is not already. + self.wait_for_link_up(ifname) + + # If primary nic is not found, check if this is it. The + # platform will attach the primary nic first so we + # won't be in primary_nic_found = false state for long. + if not primary_nic_found: + LOG.info("Checking if %s is the primary nic", ifname) + primary_nic_found = self._setup_ephemeral_networking( + iface=ifname, + timeout_minutes=20, + report_failure_if_not_primary=False, + ) + + # Exit criteria: check if we've discovered primary nic + if primary_nic_found: + LOG.info("Found primary nic for this VM.") + break + else: + self._teardown_ephemeral_networking() + + except AssertionError as error: + report_diagnostic_event(str(error), logger_func=LOG.error) + + @azure_ds_telemetry_reporter + def _create_bound_netlink_socket(self) -> socket.socket: + try: + return netlink.create_bound_netlink_socket() + except netlink.NetlinkCreateSocketError as error: + report_diagnostic_event( + f"Failed to create netlink socket: {error}", + logger_func=LOG.error, + ) + raise + + @azure_ds_telemetry_reporter + def _wait_for_pps_os_disk_shutdown(self): + """Report ready and wait for host to initiate shutdown.""" + self._report_ready_for_pps(create_marker=False) + + report_diagnostic_event( + "Waiting for host to shutdown VM...", + logger_func=LOG.info, + ) + sleep(31536000) + raise errors.ReportableErrorOsDiskPpsFailure() + + @azure_ds_telemetry_reporter + def _wait_for_pps_running_reuse(self) -> None: + """Report ready and wait for nic link to switch upon re-use.""" + nl_sock = self._create_bound_netlink_socket() + + try: + if ( + self._ephemeral_dhcp_ctx is None + or self._ephemeral_dhcp_ctx.iface is None + ): + raise RuntimeError("missing ephemeral context") + + iface = self._ephemeral_dhcp_ctx.iface + self._report_ready_for_pps() + + LOG.debug( + "Wait for vnetswitch to happen on %s", + iface, + ) + with events.ReportEventStack( + name="wait-for-media-disconnect-connect", + description="wait for vnet switch", + parent=azure_ds_reporter, + ): + try: + netlink.wait_for_media_disconnect_connect(nl_sock, iface) + except AssertionError as e: + report_diagnostic_event( + "Error while waiting for vnet switch: %s" % e, + logger_func=LOG.error, + ) + finally: + nl_sock.close() + + # Teardown source PPS network configuration. + self._teardown_ephemeral_networking() + + @azure_ds_telemetry_reporter + def _wait_for_pps_savable_reuse(self): + """Report ready and wait for nic(s) to be hot-attached upon re-use.""" + nl_sock = self._create_bound_netlink_socket() + + try: + self._report_ready_for_pps(expect_url_error=True) + try: + self._teardown_ephemeral_networking() + except subp.ProcessExecutionError as e: + report_diagnostic_event( + "Ignoring failure while tearing down networking, " + "NIC was likely unplugged: %r" % e, + logger_func=LOG.info, + ) + self._ephemeral_dhcp_ctx = None + + self._wait_for_nic_detach(nl_sock) + self._wait_for_hot_attached_primary_nic(nl_sock) + finally: + nl_sock.close() + + @azure_ds_telemetry_reporter + def _wait_for_pps_unknown_reuse(self): + """Report ready if needed for unknown/recovery PPS.""" + if os.path.isfile(self._reported_ready_marker_file): + # Already reported ready, nothing to do. + return + + self._report_ready_for_pps() + + # Teardown source PPS network configuration. + self._teardown_ephemeral_networking() + + @azure_ds_telemetry_reporter + def _poll_imds(self) -> bytes: + """Poll IMDs for reprovisiondata XML document data.""" + dhcp_attempts = 0 + reprovision_data: Optional[bytes] = None + while not reprovision_data: + if not self._is_ephemeral_networking_up(): + dhcp_attempts += 1 + try: + self._setup_ephemeral_networking(timeout_minutes=5) + except NoDHCPLeaseError: + continue + + with events.ReportEventStack( + name="get-reprovision-data-from-imds", + description="get reprovision data from imds", + parent=azure_ds_reporter, + ): + try: + reprovision_data = imds.fetch_reprovision_data() + except UrlError: + self._teardown_ephemeral_networking() + continue + + report_diagnostic_event( + "attempted dhcp %d times after reuse" % dhcp_attempts, + logger_func=LOG.debug, + ) + return reprovision_data + + @azure_ds_telemetry_reporter + def _report_failure( + self, error: errors.ReportableError, host_only: bool = False + ) -> bool: + """Report failure to Azure host and fabric. + + For errors that may be recoverable (e.g. DHCP), host_only provides a + mechanism to report the failure that can be updated later with success. + DHCP will not be attempted if host_only=True and networking is down. + + @param error: Error to report. + @param host_only: Only report to host (error may be recoverable). + @return: The success status of sending the failure signal. + """ + encoded_report = error.as_encoded_report(vm_id=self._vm_id) + report_diagnostic_event( + f"Azure datasource failure occurred: {encoded_report}", + logger_func=LOG.error, + ) + report_dmesg_to_kvp() + reported = kvp.report_via_kvp(encoded_report) + if host_only: + return reported + + if self._is_ephemeral_networking_up(): + try: + report_diagnostic_event( + "Using cached ephemeral dhcp context " + "to report failure to Azure", + logger_func=LOG.debug, + ) + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, + encoded_report=encoded_report, + ) + self._negotiated = True + return True + except Exception as e: + report_diagnostic_event( + "Failed to report failure using " + "cached ephemeral dhcp context: %s" % e, + logger_func=LOG.error, + ) + + try: + report_diagnostic_event( + "Using new ephemeral dhcp to report failure to Azure", + logger_func=LOG.debug, + ) + self._teardown_ephemeral_networking() + try: + self._setup_ephemeral_networking(timeout_minutes=20) + except NoDHCPLeaseError: + # Reporting failure will fail, but it will emit telemetry. + pass + report_failure_to_fabric( + endpoint=self._wireserver_endpoint, + encoded_report=encoded_report, + ) + self._negotiated = True + return True + except Exception as e: + report_diagnostic_event( + "Failed to report failure using new ephemeral dhcp: %s" % e, + logger_func=LOG.debug, + ) + + return False + + @azure_ds_telemetry_reporter + def _report_ready( + self, *, pubkey_info: Optional[List[str]] = None + ) -> Optional[List[str]]: + """Tells the fabric provisioning has completed. + + :param pubkey_info: Fingerprints of keys to request from Wireserver. + + :raises Exception: if failed to report. + + :returns: List of SSH keys, if requested. + """ + report_dmesg_to_kvp() + kvp.report_success_to_host(vm_id=self._vm_id) + + try: + data = get_metadata_from_fabric( + endpoint=self._wireserver_endpoint, + distro=self.distro, + iso_dev=self._iso_dev, + pubkey_info=pubkey_info, + ) + except Exception as e: + report_diagnostic_event( + "Error communicating with Azure fabric; You may experience " + "connectivity issues: %s" % e, + logger_func=LOG.warning, + ) + raise + + # Reporting ready ejected OVF media, no need to do so again. + self._iso_dev = None + self._negotiated = True + return data + + def _ppstype_from_imds(self, imds_md: dict) -> Optional[str]: + try: + return imds_md["extended"]["compute"]["ppsType"] + except Exception as e: + report_diagnostic_event( + "Could not retrieve pps configuration from IMDS: %s" % e, + logger_func=LOG.debug, + ) + return None + + def _determine_pps_type(self, ovf_cfg: dict, imds_md: dict) -> PPSType: + """Determine PPS type using OVF, IMDS data, and reprovision marker.""" + if os.path.isfile(self._reported_ready_marker_file): + pps_type = PPSType.UNKNOWN + elif ( + ovf_cfg.get("PreprovisionedVMType", None) == PPSType.SAVABLE.value + or self._ppstype_from_imds(imds_md) == PPSType.SAVABLE.value + ): + pps_type = PPSType.SAVABLE + elif ( + ovf_cfg.get("PreprovisionedVMType", None) == PPSType.OS_DISK.value + or self._ppstype_from_imds(imds_md) == PPSType.OS_DISK.value + ): + pps_type = PPSType.OS_DISK + elif ( + ovf_cfg.get("PreprovisionedVm") is True + or ovf_cfg.get("PreprovisionedVMType", None) + == PPSType.RUNNING.value + or self._ppstype_from_imds(imds_md) == PPSType.RUNNING.value + ): + pps_type = PPSType.RUNNING + else: + pps_type = PPSType.NONE + + report_diagnostic_event( + "PPS type: %s" % pps_type.value, logger_func=LOG.info + ) + return pps_type + + @azure_ds_telemetry_reporter + def _reprovision(self): + """Initiate the reprovisioning workflow. + + Ephemeral networking is up upon successful reprovisioning. + """ + contents = self._poll_imds() + with events.ReportEventStack( + name="reprovisioning-read-azure-ovf", + description="read azure ovf during reprovisioning", + parent=azure_ds_reporter, + ): + md, ud, cfg = read_azure_ovf(contents) + return (md, ud, cfg, {"ovf-env.xml": contents}) + + @azure_ds_telemetry_reporter + def _determine_wireserver_pubkey_info( + self, *, cfg: dict, imds_md: dict + ) -> Optional[List[str]]: + """Determine the fingerprints we need to retrieve from Wireserver. + + :return: List of keys to request from Wireserver, if any, else None. + """ + pubkey_info: Optional[List[str]] = None + try: + self._get_public_keys_from_imds(imds_md) + except (KeyError, ValueError): + pubkey_info = cfg.get("_pubkeys", None) + log_msg = "Retrieved {} fingerprints from OVF".format( + len(pubkey_info) if pubkey_info is not None else 0 + ) + report_diagnostic_event(log_msg, logger_func=LOG.debug) + return pubkey_info + + def _cleanup_markers(self): + """Cleanup any marker files.""" + util.del_file(self._reported_ready_marker_file) + + @azure_ds_telemetry_reporter + def activate(self, cfg, is_new_instance): + instance_dir = self.paths.get_ipath_cur() + try: + address_ephemeral_resize( + instance_dir, + is_new_instance=is_new_instance, + preserve_ntfs=self.ds_cfg.get(DS_CFG_KEY_PRESERVE_NTFS, False), + ) + finally: + report_dmesg_to_kvp() + return + + @property + def availability_zone(self): + return ( + self.metadata.get("imds", {}) + .get("compute", {}) + .get("platformFaultDomain") + ) + + @azure_ds_telemetry_reporter + def _generate_network_config(self): + """Generate network configuration according to configuration.""" + # Use IMDS network metadata, if configured. + if ( + self._metadata_imds + and self._metadata_imds != sources.UNSET + and self.ds_cfg.get("apply_network_config") + ): + try: + return generate_network_config_from_instance_network_metadata( + self._metadata_imds["network"], + apply_network_config_for_secondary_ips=self.ds_cfg.get( + "apply_network_config_for_secondary_ips" + ), + ) + except Exception as e: + LOG.error( + "Failed generating network config " + "from IMDS network metadata: %s", + str(e), + ) + + # Generate fallback configuration. + try: + return _generate_network_config_from_fallback_config() + except Exception as e: + LOG.error("Failed generating fallback network config: %s", str(e)) + + return {} + + @property + def network_config(self): + """Provide network configuration v2 dictionary.""" + # Use cached config, if present. + if self._network_config and self._network_config != sources.UNSET: + return self._network_config + + self._network_config = self._generate_network_config() + return self._network_config + + @property + def region(self): + return self.metadata.get("imds", {}).get("compute", {}).get("location") + + @azure_ds_telemetry_reporter + def validate_imds_network_metadata(self, imds_md: dict) -> bool: + """Validate IMDS network config and report telemetry for errors.""" + local_macs = get_hv_netvsc_macs_normalized() + + try: + network_config = imds_md["network"] + imds_macs = [ + normalize_mac_address(i["macAddress"]) + for i in network_config["interface"] + ] + except KeyError: + report_diagnostic_event( + "IMDS network metadata has incomplete configuration: %r" + % imds_md.get("network"), + logger_func=LOG.warning, + ) + return False + + missing_macs = [m for m in local_macs if m not in imds_macs] + if not missing_macs: + return True + + report_diagnostic_event( + "IMDS network metadata is missing configuration for NICs %r: %r" + % (missing_macs, network_config), + logger_func=LOG.warning, + ) + + if not self._ephemeral_dhcp_ctx or not self._ephemeral_dhcp_ctx.iface: + # No primary interface to check against. + return False + + primary_mac = net.get_interface_mac(self._ephemeral_dhcp_ctx.iface) + if not primary_mac or not isinstance(primary_mac, str): + # Unexpected data for primary interface. + return False + + primary_mac = normalize_mac_address(primary_mac) + if primary_mac in missing_macs: + report_diagnostic_event( + "IMDS network metadata is missing primary NIC %r: %r" + % (primary_mac, network_config), + logger_func=LOG.warning, + ) + + return False + + def _cleanup_resourcedisk_fstab(self): + """ + Remove resource disk entries from fstab, which are configured + by cloud-init i.e. lines containing "/dev/disk/cloud/azure_resource" + and cloudconfig comment. + """ + cc_mounts.cleanup_fstab(RESOURCE_DISK_PATH) + + def clean(self): + # Azure-specific cleanup logic for "cloud-init clean -c datasource" + self._cleanup_resourcedisk_fstab() + + +def _username_from_imds(imds_data): + try: + return imds_data["compute"]["osProfile"]["adminUsername"] + except KeyError: + return None + + +def _userdata_from_imds(imds_data): + try: + return imds_data["compute"]["userData"] + except KeyError: + return None + + +def _hostname_from_imds(imds_data): + try: + return imds_data["compute"]["osProfile"]["computerName"] + except KeyError: + return None + + +def _disable_password_from_imds(imds_data): + try: + return ( + imds_data["compute"]["osProfile"]["disablePasswordAuthentication"] + == "true" + ) + except KeyError: + return None + + +def _key_is_openssh_formatted(key): + """ + Validate whether or not the key is OpenSSH-formatted. + """ + # See https://bugs.launchpad.net/cloud-init/+bug/1910835 + if "\r\n" in key.strip(): + return False + + parser = ssh_util.AuthKeyLineParser() + try: + akl = parser.parse(key) + except TypeError: + return False + + return akl.keytype is not None + + +def _partitions_on_device(devpath, maxnum=16): + # return a list of tuples (ptnum, path) for each part on devpath + for suff in ("-part", "p", ""): + found = [] + for pnum in range(1, maxnum): + ppath = devpath + suff + str(pnum) + if os.path.exists(ppath): + found.append((pnum, os.path.realpath(ppath))) + if found: + return found + return [] + + +@azure_ds_telemetry_reporter +def _has_ntfs_filesystem(devpath): + ntfs_devices = util.find_devs_with("TYPE=ntfs", no_cache=True) + LOG.debug("ntfs_devices found = %s", ntfs_devices) + return os.path.realpath(devpath) in ntfs_devices + + +@azure_ds_telemetry_reporter +def can_dev_be_reformatted(devpath, preserve_ntfs): + """Determine if the ephemeral drive at devpath should be reformatted. + + A fresh ephemeral disk is formatted by Azure and will: + a.) have a partition table (dos or gpt) + b.) have 1 partition that is ntfs formatted, or + have 2 partitions with the second partition ntfs formatted. + (larger instances with >2TB ephemeral disk have gpt, and will + have a microsoft reserved partition as part 1. LP: #1686514) + c.) the ntfs partition will have no files other than possibly + 'dataloss_warning_readme.txt' + + User can indicate that NTFS should never be destroyed by setting + DS_CFG_KEY_PRESERVE_NTFS in dscfg. + If data is found on NTFS, user is warned to set DS_CFG_KEY_PRESERVE_NTFS + to make sure cloud-init does not accidentally wipe their data. + If cloud-init cannot mount the disk to check for data, destruction + will be allowed, unless the dscfg key is set.""" + if preserve_ntfs: + msg = "config says to never destroy NTFS (%s.%s), skipping checks" % ( + ".".join(DS_CFG_PATH), + DS_CFG_KEY_PRESERVE_NTFS, + ) + return False, msg + + if not os.path.exists(devpath): + return False, "device %s does not exist" % devpath + + LOG.debug( + "Resolving realpath of %s -> %s", devpath, os.path.realpath(devpath) + ) + + # devpath of /dev/sd[a-z] or /dev/disk/cloud/azure_resource + # where partitions are "1" or "-part1" or "p1" + partitions = _partitions_on_device(devpath) + if not partitions: + return False, "device %s was not partitioned" % devpath + + partition_len = len(partitions) + if partition_len > 2: + msg = "device %s had 3 or more partitions: %s" % ( + devpath, + " ".join([p[1] for p in partitions]), + ) + return False, msg + + if partition_len == 2: + cand_part, cand_path = partitions[1] + else: + cand_part, cand_path = partitions[0] + + if not _has_ntfs_filesystem(cand_path): + msg = "partition %s (%s) on device %s was not ntfs formatted" % ( + cand_part, + cand_path, + devpath, + ) + return False, msg + + @azure_ds_telemetry_reporter + def count_files(mp): + ignored = set( + ["dataloss_warning_readme.txt", "system volume information"] + ) + return len([f for f in os.listdir(mp) if f.lower() not in ignored]) + + bmsg = "partition %s (%s) on device %s was ntfs formatted" % ( + cand_part, + cand_path, + devpath, + ) + + with events.ReportEventStack( + name="mount-ntfs-and-count", + description="mount-ntfs-and-count", + parent=azure_ds_reporter, + ) as evt: + try: + file_count = util.mount_cb( + cand_path, + count_files, + mtype="ntfs", + update_env_for_mount={"LANG": "C"}, + log_error=False, + ) + except util.MountFailedError as e: + evt.description = "cannot mount ntfs" + if "unknown filesystem type 'ntfs'" in str(e): + return ( + True, + ( + bmsg + " but this system cannot mount NTFS," + " assuming there are no important files." + " Formatting allowed." + ), + ) + return False, bmsg + " but mount of %s failed: %s" % (cand_part, e) + + if file_count != 0: + evt.description = "mounted and counted %d files" % file_count + LOG.warning( + "it looks like you're using NTFS on the ephemeral" + " disk, to ensure that filesystem does not get wiped," + " set %s.%s in config", + ".".join(DS_CFG_PATH), + DS_CFG_KEY_PRESERVE_NTFS, + ) + return False, bmsg + " but had %d files on it." % file_count + + return True, bmsg + " and had no important files. Safe for reformatting." + + +@azure_ds_telemetry_reporter +def address_ephemeral_resize( + instance_dir: str, + devpath: str = RESOURCE_DISK_PATH, + is_new_instance: bool = False, + preserve_ntfs: bool = False, +): + if not os.path.exists(devpath): + report_diagnostic_event( + "Ephemeral resource disk '%s' does not exist." % devpath, + logger_func=LOG.debug, + ) + return + else: + report_diagnostic_event( + "Ephemeral resource disk '%s' exists." % devpath, + logger_func=LOG.debug, + ) + + result = False + msg = None + if is_new_instance: + result, msg = (True, "First instance boot.") + else: + result, msg = can_dev_be_reformatted(devpath, preserve_ntfs) + + LOG.debug("reformattable=%s: %s", result, msg) + if not result: + return + + for mod in ["disk_setup", "mounts"]: + sempath = os.path.join(instance_dir, "sem", "config_" + mod) + bmsg = 'Marker "%s" for module "%s"' % (sempath, mod) + if os.path.exists(sempath): + try: + os.unlink(sempath) + LOG.debug("%s removed.", bmsg) + except FileNotFoundError as e: + LOG.warning("%s: remove failed! (%s)", bmsg, e) + else: + LOG.debug("%s did not exist.", bmsg) + return + + +@azure_ds_telemetry_reporter +def write_files(datadir, files, dirmode=None): + def _redact_password(cnt, fname): + """Azure provides the UserPassword in plain text. So we redact it""" + try: + root = ET.fromstring(cnt) # nosec B314 + for elem in root.iter(): + if ( + "UserPassword" in elem.tag + and elem.text != DEF_PASSWD_REDACTION + ): + elem.text = DEF_PASSWD_REDACTION + return ET.tostring(root) + except Exception: + LOG.critical("failed to redact userpassword in %s", fname) + return cnt + + if not datadir: + return + if not files: + files = {} + util.ensure_dir(datadir, dirmode) + for name, content in files.items(): + fname = os.path.join(datadir, name) + if "ovf-env.xml" in name: + content = _redact_password(content, fname) + util.write_file(filename=fname, content=content, mode=0o600) + + +@azure_ds_telemetry_reporter +def read_azure_ovf(contents): + """Parse OVF XML contents. + + :return: Tuple of metadata, configuration, userdata dicts. + + :raises NonAzureDataSource: if XML is not in Azure's format. + :raises errors.ReportableError: if XML is unparsable or invalid. + """ + ovf_env = OvfEnvXml.parse_text(contents) + md: Dict[str, Any] = {} + cfg = {} + ud = ovf_env.custom_data or "" + + if ovf_env.hostname: + md["local-hostname"] = ovf_env.hostname + + if ovf_env.public_keys: + cfg["_pubkeys"] = ovf_env.public_keys + + if ovf_env.disable_ssh_password_auth is not None: + cfg["ssh_pwauth"] = not ovf_env.disable_ssh_password_auth + elif ovf_env.password: + cfg["ssh_pwauth"] = True + + defuser = {} + if ovf_env.username: + defuser["name"] = ovf_env.username + if ovf_env.password: + defuser["lock_passwd"] = False + if DEF_PASSWD_REDACTION != ovf_env.password: + defuser["hashed_passwd"] = encrypt_pass(ovf_env.password) + + if defuser: + cfg["system_info"] = {"default_user": defuser} + + cfg["PreprovisionedVm"] = ovf_env.preprovisioned_vm + report_diagnostic_event( + "PreprovisionedVm: %s" % ovf_env.preprovisioned_vm, + logger_func=LOG.info, + ) + + cfg["PreprovisionedVMType"] = ovf_env.preprovisioned_vm_type + report_diagnostic_event( + "PreprovisionedVMType: %s" % ovf_env.preprovisioned_vm_type, + logger_func=LOG.info, + ) + + cfg["ProvisionGuestProxyAgent"] = ovf_env.provision_guest_proxy_agent + report_diagnostic_event( + "ProvisionGuestProxyAgent: %s" % ovf_env.provision_guest_proxy_agent, + logger_func=LOG.info, + ) + return (md, ud, cfg) + + +def encrypt_pass(password): + return blowfish_hash(password) + + +def find_primary_nic(): + candidate_nics = net.find_candidate_nics() + if candidate_nics: + return candidate_nics[0] + + return None + + +@azure_ds_telemetry_reporter +def _check_freebsd_cdrom(cdrom_dev): + """Return boolean indicating path to cdrom device has content.""" + try: + with open(cdrom_dev) as fp: + fp.read(1024) + return True + except IOError: + LOG.debug("cdrom (%s) is not configured", cdrom_dev) + return False + + +@azure_ds_telemetry_reporter +def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE): + """Return content random seed file if available, otherwise, + return None.""" + # azure / hyper-v provides random data here + # now update ds_cfg to reflect contents pass in config + if source is None: + return None + seed = util.load_binary_file(source, quiet=True) + + # The seed generally contains non-Unicode characters. load_binary_file puts + # them into bytes (in python 3). + # bytes is a non-serializable type, and the handler + # used applies b64 encoding *again* to handle it. The simplest solution + # is to just b64encode the data and then decode it to a serializable + # string. Same number of bits of entropy, just with 25% more zeroes. + # There's no need to undo this base64-encoding when the random seed is + # actually used in cc_seed_random.py. + return base64.b64encode(seed).decode() # pyright: ignore + + +@azure_ds_telemetry_reporter +def list_possible_azure_ds(seed, cache_dir): + yield seed + yield DEFAULT_PROVISIONING_ISO_DEV + if util.is_FreeBSD(): + cdrom_dev = "/dev/cd0" + if _check_freebsd_cdrom(cdrom_dev): + yield cdrom_dev + else: + for fstype in ("iso9660", "udf"): + yield from util.find_devs_with("TYPE=%s" % fstype) + if cache_dir: + yield cache_dir + + +@azure_ds_telemetry_reporter +def load_azure_ds_dir(source_dir): + ovf_file = os.path.join(source_dir, "ovf-env.xml") + + if not os.path.isfile(ovf_file): + raise NonAzureDataSource("No ovf-env file found") + + with performance.Timed("Reading ovf-env.xml"), open(ovf_file, "rb") as fp: + contents = fp.read() + + md, ud, cfg = read_azure_ovf(contents) + return (md, ud, cfg, {"ovf-env.xml": contents}) + + +@azure_ds_telemetry_reporter +def generate_network_config_from_instance_network_metadata( + network_metadata: dict, + *, + apply_network_config_for_secondary_ips: bool, +) -> dict: + """Convert imds network metadata dictionary to network v2 configuration. + + :param: network_metadata: Dict of "network" key from instance metadata. + + :return: Dictionary containing network version 2 standard configuration. + """ + netconfig: Dict[str, Any] = {"version": 2, "ethernets": {}} + for idx, intf in enumerate(network_metadata["interface"]): + has_ip_address = False + # First IPv4 and/or IPv6 address will be obtained via DHCP. + # Any additional IPs of each type will be set as static + # addresses. + nicname = "eth{idx}".format(idx=idx) + dhcp_override = {"route-metric": (idx + 1) * 100} + # DNS resolution through secondary NICs is not supported, disable it. + if idx > 0: + dhcp_override["use-dns"] = False + dev_config: Dict[str, Any] = { + "dhcp4": True, + "dhcp4-overrides": dhcp_override, + "dhcp6": False, + } + for addr_type in ("ipv4", "ipv6"): + addresses = intf.get(addr_type, {}).get("ipAddress", []) + # If there are no available IP addresses, then we don't + # want to add this interface to the generated config. + if not addresses: + LOG.debug("No %s addresses found for: %r", addr_type, intf) + continue + has_ip_address = True + if addr_type == "ipv4": + default_prefix = "24" + else: + default_prefix = "128" + if addresses: + dev_config["dhcp6"] = True + # non-primary interfaces should have a higher + # route-metric (cost) so default routes prefer + # primary nic due to lower route-metric value + dev_config["dhcp6-overrides"] = dhcp_override + + if not apply_network_config_for_secondary_ips: + continue + + for addr in addresses[1:]: + # Append static address config for ip > 1 + netPrefix = intf[addr_type]["subnet"][0].get( + "prefix", default_prefix + ) + privateIp = addr["privateIpAddress"] + if not dev_config.get("addresses"): + dev_config["addresses"] = [] + dev_config["addresses"].append( + "{ip}/{prefix}".format(ip=privateIp, prefix=netPrefix) + ) + if dev_config and has_ip_address: + mac = normalize_mac_address(intf["macAddress"]) + dev_config.update( + {"match": {"macaddress": mac.lower()}, "set-name": nicname} + ) + driver = determine_device_driver_for_mac(mac) + if driver: + dev_config["match"]["driver"] = driver + netconfig["ethernets"][nicname] = dev_config + continue + + LOG.debug( + "No configuration for: %s (dev_config=%r) (has_ip_address=%r)", + nicname, + dev_config, + has_ip_address, + ) + return netconfig + + +@azure_ds_telemetry_reporter +def _generate_network_config_from_fallback_config() -> dict: + """Generate fallback network config. + + @return: Dictionary containing network version 2 standard configuration. + """ + cfg = net.generate_fallback_config(config_driver=True) + if cfg is None: + return {} + return cfg + + +@azure_ds_telemetry_reporter +def maybe_remove_ubuntu_network_config_scripts(paths=None): + """Remove Azure-specific ubuntu network config for non-primary nics. + + @param paths: List of networking scripts or directories to remove when + present. + + In certain supported ubuntu images, static udev rules or netplan yaml + config is delivered in the base ubuntu image to support dhcp on any + additional interfaces which get attached by a customer at some point + after initial boot. Since the Azure datasource can now regenerate + network configuration as metadata reports these new devices, we no longer + want the udev rules or netplan's 90-hotplug-azure.yaml to configure + networking on eth1 or greater as it might collide with cloud-init's + configuration. + + Remove the any existing extended network scripts if the datasource is + enabled to write network per-boot. + """ + if not paths: + paths = UBUNTU_EXTENDED_NETWORK_SCRIPTS + logged = False + for path in paths: + if os.path.exists(path): + if not logged: + LOG.info( + "Removing Ubuntu extended network scripts because" + " cloud-init updates Azure network configuration on the" + " following events: %s.", + [EventType.BOOT.value, EventType.BOOT_LEGACY.value], + ) + logged = True + if os.path.isdir(path): + util.del_dir(path) + else: + util.del_file(path) + + +# Legacy: Must be present in case we load an old pkl object +DataSourceAzureNet = DataSourceAzure + +# Used to match classes to dependencies +datasources = [ + (DataSourceAzure, (sources.DEP_FILESYSTEM,)), +] + + +# Return a list of data sources that match this set of dependencies +def get_datasource_list(depends): + return sources.list_from_depends(depends, datasources) + + +def get_interface_details( + iface: Optional[str], +) -> Tuple[Optional[str], Optional[str]]: + if iface is None: + return None, None + + interfaces = net.get_interfaces() + for interface_name, interface_mac, interface_driver, _ in interfaces: + if interface_name == iface: + return interface_mac, interface_driver + + return None, None diff --git a/vhdbuilder/packer/vhd-image-builder-acl.json b/vhdbuilder/packer/vhd-image-builder-acl.json index 03adb0f11f0..7488062c5cd 100644 --- a/vhdbuilder/packer/vhd-image-builder-acl.json +++ b/vhdbuilder/packer/vhd-image-builder-acl.json @@ -221,6 +221,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 615da5e9ee3..870a9b63601 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -232,6 +232,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-base.json b/vhdbuilder/packer/vhd-image-builder-base.json index bfe60f33041..6834936d9be 100644 --- a/vhdbuilder/packer/vhd-image-builder-base.json +++ b/vhdbuilder/packer/vhd-image-builder-base.json @@ -115,6 +115,11 @@ "source": "parts/linux/cloud-init/artifacts/cloud-init-status-check.sh", "destination": "/home/packer/cloud-init-status-check.sh" }, + { + "type": "file", + "source": "vhdbuilder/packer/ubuntu/cloudinit/DataSourceAzure.py", + "destination": "/home/packer/DataSourceAzure.py" + }, { "type": "file", "source": "vhdbuilder/packer/prefetch.sh", @@ -235,6 +240,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-cvm.json b/vhdbuilder/packer/vhd-image-builder-cvm.json index 0e444781783..96d9e16cc5b 100644 --- a/vhdbuilder/packer/vhd-image-builder-cvm.json +++ b/vhdbuilder/packer/vhd-image-builder-cvm.json @@ -239,6 +239,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json b/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json index 203a22dc035..8e9bf8736c6 100644 --- a/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json @@ -223,6 +223,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-flatcar.json b/vhdbuilder/packer/vhd-image-builder-flatcar.json index 959d78535d9..9e2c58defcb 100644 --- a/vhdbuilder/packer/vhd-image-builder-flatcar.json +++ b/vhdbuilder/packer/vhd-image-builder-flatcar.json @@ -223,6 +223,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json b/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json index 6ed96281c5c..cba6b4b532a 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json @@ -231,6 +231,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json b/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json index e4d58283d56..762ef5f3dd3 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json @@ -237,6 +237,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner.json b/vhdbuilder/packer/vhd-image-builder-mariner.json index 3fd5e90a8b3..75f9d707e5d 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner.json @@ -233,6 +233,11 @@ "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", "destination": "/home/packer/cse_send_logs.py" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/report_ready.py", + "destination": "/home/packer/report_ready.py" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh",